Make lint

2021-12-07 12:51:58 +00:00 · 2021-12-07 12:51:58 +00:00 · c9972e6f14
parent 30cfafce56
commit c9972e6f14
14 changed files with 319 additions and 236 deletions
--- a/TTS/tts/datasets/__init__.py
+++ b/TTS/tts/datasets/__init__.py
@@ -111,8 +111,8 @@ def load_tts_samples(
            meta_data_eval_all += meta_data_eval
        meta_data_train_all += meta_data_train
        # load attention masks for the duration predictor training
-        if dataset.meta_file_attn_mask:
-            meta_data = dict(load_attention_mask_meta_data(dataset["meta_file_attn_mask"]))
+        if d.meta_file_attn_mask:
+            meta_data = dict(load_attention_mask_meta_data(d["meta_file_attn_mask"]))
            for idx, ins in enumerate(meta_data_train_all):
                attn_file = meta_data[ins["audio_file"]].strip()
                meta_data_train_all[idx].update({"alignment_file": attn_file})
--- a/TTS/tts/datasets/dataset.py
+++ b/TTS/tts/datasets/dataset.py
@@ -1,7 +1,6 @@
 import collections
 import os
 import random
-from multiprocessing import Pool
 from typing import Dict, List, Union

 import numpy as np
@@ -10,7 +9,6 @@ import tqdm
 from torch.utils.data import Dataset

 from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
-from TTS.tts.utils.text import TTSTokenizer
 from TTS.utils.audio import AudioProcessor


@@ -183,7 +181,7 @@ class TTSDataset(Dataset):
    def get_phonemes(self, idx, text):
        out_dict = self.phoneme_dataset[idx]
        assert text == out_dict["text"], f"{text} != {out_dict['text']}"
-        assert out_dict["token_ids"].size > 0
+        assert len(out_dict["token_ids"]) > 0
        return out_dict

    def get_f0(self, idx):
@@ -192,7 +190,8 @@ class TTSDataset(Dataset):
        assert wav_file == out_dict["audio_file"]
        return out_dict

-    def get_attn_maks(self, attn_file):
+    @staticmethod
+    def get_attn_mask(attn_file):
        return np.load(attn_file)

    def get_token_ids(self, idx, text):
@@ -207,7 +206,7 @@ class TTSDataset(Dataset):

        raw_text = item["text"]

-        wav = np.asarray(self.load_wav(item[]), dtype=np.float32)
+        wav = np.asarray(self.load_wav(item["audio_file"]), dtype=np.float32)

        # apply noise for augmentation
        if self.use_noise_augment:
@@ -262,7 +261,7 @@ class TTSDataset(Dataset):
        idxs = np.argsort(lengths)  # ascending order
        ignore_idx = []
        keep_idx = []
-        for i, idx in enumerate(idxs):
+        for idx in idxs:
            length = lengths[idx]
            if length < min_len or length > max_len:
                ignore_idx.append(idx)
@@ -277,6 +276,7 @@ class TTSDataset(Dataset):

    @staticmethod
    def create_buckets(samples, batch_group_size: int):
+        assert batch_group_size > 0
        for i in range(len(samples) // batch_group_size):
            offset = i * batch_group_size
            end_offset = offset + batch_group_size
@@ -319,7 +319,8 @@ class TTSDataset(Dataset):
        # shuffle batch groups
        # create batches with similar length items
        # the larger the `batch_group_size`, the higher the length variety in a batch.
-        samples = self.create_buckets(samples, self.batch_group_size)
+        if self.batch_group_size > 0:
+            samples = self.create_buckets(samples, self.batch_group_size)

        # update items to the new sorted items
        self.samples = samples
@@ -571,6 +572,7 @@ class PhonemeDataset(Dataset):

        We use pytorch dataloader because we are lazy.
        """
+        print("[*] Pre-computing phonemes...")
        with tqdm.tqdm(total=len(self)) as pbar:
            batch_size = num_workers if num_workers > 0 else 1
            dataloder = torch.utils.data.DataLoader(
@@ -658,16 +660,21 @@ class F0Dataset:
        return len(self.samples)

    def precompute(self, num_workers=0):
+        print("[*] Pre-computing F0s...")
        with tqdm.tqdm(total=len(self)) as pbar:
            batch_size = num_workers if num_workers > 0 else 1
+            # we do not normalize at preprocessing
+            normalize_f0 = self.normalize_f0
+            self.normalize_f0 = False
            dataloder = torch.utils.data.DataLoader(
                batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn
            )
            computed_data = []
            for batch in dataloder:
                f0 = batch["f0"]
-                computed_data.append([f for f in f0])
+                computed_data.append(f for f in f0)
                pbar.update(batch_size)
+            self.normalize_f0 = normalize_f0

        if self.normalize_f0:
            computed_data = [tensor for batch in computed_data for tensor in batch]  # flatten
@@ -746,80 +753,80 @@ class F0Dataset:
        print(f"{indent}| > Number of instances : {len(self.samples)}")


-if __name__ == "__main__":
-    from torch.utils.data import DataLoader
+# if __name__ == "__main__":
+#     from torch.utils.data import DataLoader

-    from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
-    from TTS.tts.datasets import load_tts_samples
-    from TTS.tts.utils.text.characters import IPAPhonemes
-    from TTS.tts.utils.text.phonemizers import ESpeak
+#     from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
+#     from TTS.tts.datasets import load_tts_samples
+#     from TTS.tts.utils.text.characters import IPAPhonemes
+#     from TTS.tts.utils.text.phonemizers import ESpeak

-    dataset_config = BaseDatasetConfig(
-        name="ljspeech",
-        meta_file_train="metadata.csv",
-        path="/Users/erengolge/Projects/TTS/recipes/ljspeech/LJSpeech-1.1",
-    )
-    train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
-    samples = train_samples + eval_samples
+#     dataset_config = BaseDatasetConfig(
+#         name="ljspeech",
+#         meta_file_train="metadata.csv",
+#         path="/Users/erengolge/Projects/TTS/recipes/ljspeech/LJSpeech-1.1",
+#     )
+#     train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
+#     samples = train_samples + eval_samples

-    phonemizer = ESpeak(language="en-us")
-    tokenizer = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=phonemizer)
-    # ph_dataset = PhonemeDataset(samples, tokenizer, phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests")
-    # ph_dataset.precompute(num_workers=4)
+#     phonemizer = ESpeak(language="en-us")
+#     tokenizer = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=phonemizer)
+#     # ph_dataset = PhonemeDataset(samples, tokenizer, phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests")
+#     # ph_dataset.precompute(num_workers=4)

-    # dataloader = DataLoader(ph_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=ph_dataset.collate_fn)
-    # for batch in dataloader:
-    #     print(batch)
-    #     break
+#     # dataloader = DataLoader(ph_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=ph_dataset.collate_fn)
+#     # for batch in dataloader:
+#     #     print(batch)
+#     #     break

-    audio_config = BaseAudioConfig(
-        sample_rate=22050,
-        win_length=1024,
-        hop_length=256,
-        num_mels=80,
-        preemphasis=0.0,
-        ref_level_db=20,
-        log_func="np.log",
-        do_trim_silence=True,
-        trim_db=45,
-        mel_fmin=0,
-        mel_fmax=8000,
-        spec_gain=1.0,
-        signal_norm=False,
-        do_amp_to_db_linear=False,
-    )
+#     audio_config = BaseAudioConfig(
+#         sample_rate=22050,
+#         win_length=1024,
+#         hop_length=256,
+#         num_mels=80,
+#         preemphasis=0.0,
+#         ref_level_db=20,
+#         log_func="np.log",
+#         do_trim_silence=True,
+#         trim_db=45,
+#         mel_fmin=0,
+#         mel_fmax=8000,
+#         spec_gain=1.0,
+#         signal_norm=False,
+#         do_amp_to_db_linear=False,
+#     )

-    ap = AudioProcessor.init_from_config(audio_config)
+#     ap = AudioProcessor.init_from_config(audio_config)

-    # f0_dataset = F0Dataset(samples, ap, cache_path="/Users/erengolge/Projects/TTS/f0_tests", verbose=False, precompute_num_workers=4)
+#     # f0_dataset = F0Dataset(samples, ap, cache_path="/Users/erengolge/Projects/TTS/f0_tests", verbose=False, precompute_num_workers=4)

-    # dataloader = DataLoader(f0_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=f0_dataset.collate_fn)
-    # for batch in dataloader:
-    #     print(batch)
-    #     breakpoint()
-    #     break
+#     # dataloader = DataLoader(f0_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=f0_dataset.collate_fn)
+#     # for batch in dataloader:
+#     #     print(batch)
+#     #     breakpoint()
+#     #     break

-    dataset = TTSDataset(
-        outputs_per_step=1,
-        compute_linear_spec=False,
-        samples=samples,
-        ap=ap,
-        return_wav=False,
-        batch_group_size=0,
-        min_seq_len=0,
-        max_seq_len=500,
-        use_noise_augment=False,
-        verbose=True,
-        speaker_id_mapping=None,
-        d_vector_mapping=None,
-        compute_f0=True,
-        f0_cache_path="/Users/erengolge/Projects/TTS/f0_tests",
-        tokenizer=tokenizer,
-        phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests",
-        precompute_num_workers=4,
-    )
+#     dataset = TTSDataset(
+#         outputs_per_step=1,
+#         compute_linear_spec=False,
+#         samples=samples,
+#         ap=ap,
+#         return_wav=False,
+#         batch_group_size=0,
+#         min_seq_len=0,
+#         max_seq_len=500,
+#         use_noise_augment=False,
+#         verbose=True,
+#         speaker_id_mapping=None,
+#         d_vector_mapping=None,
+#         compute_f0=True,
+#         f0_cache_path="/Users/erengolge/Projects/TTS/f0_tests",
+#         tokenizer=tokenizer,
+#         phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests",
+#         precompute_num_workers=4,
+#     )

-    dataloader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=0, collate_fn=dataset.collate_fn)
-    for batch in dataloader:
-        print(batch)
-        break
+#     dataloader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=0, collate_fn=dataset.collate_fn)
+#     for batch in dataloader:
+#         print(batch)
+#         break
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -199,10 +199,10 @@ def synthesis(
        wav = model_outputs.squeeze(0)
    else:
        if use_griffin_lim:
-            wav = inv_spectrogram(model_outputs, ap, CONFIG)
+            wav = inv_spectrogram(model_outputs, model.ap, CONFIG)
            # trim silence
            if do_trim_silence:
-                wav = trim_silence(wav, ap)
+                wav = trim_silence(wav, model.ap)
    return_dict = {
        "wav": wav,
        "alignments": alignments,
--- a/TTS/tts/utils/text/characters.py
+++ b/TTS/tts/utils/text/characters.py
@@ -1,3 +1,8 @@
+from dataclasses import replace
+
+from TTS.tts.configs.shared_configs import CharactersConfig
+
+
 def parse_symbols():
    return {
        "pad": _pad,
@@ -29,46 +34,49 @@ _diacrilics = "ɚ˞ɫ"
 _phonemes = _vowels + _non_pulmonic_consonants + _pulmonic_consonants + _suprasegmentals + _other_symbols + _diacrilics


-def create_graphemes(
-    characters=_characters,
-    punctuations=_punctuations,
-    pad=_pad,
-    eos=_eos,
-    bos=_bos,
-    blank=_blank,
-    unique=True,
-):  # pylint: disable=redefined-outer-name
-    """Function to create default characters and phonemes"""
-    # create graphemes
-    _graphemes = list(characters)
-    _graphemes = [bos] + _graphemes if len(bos) > 0 and bos is not None else _graphemes
-    _graphemes = [eos] + _graphemes if len(bos) > 0 and eos is not None else _graphemes
-    _graphemes = [pad] + _graphemes if len(bos) > 0 and pad is not None else _graphemes
-    _graphemes = [blank] + _graphemes if len(bos) > 0 and blank is not None else _graphemes
-    _graphemes = _graphemes + list(punctuations)
-    return _graphemes, _phonemes
+# def create_graphemes(
+#     characters=_characters,
+#     punctuations=_punctuations,
+#     pad=_pad,
+#     eos=_eos,
+#     bos=_bos,
+#     blank=_blank,
+#     unique=True,
+# ):  # pylint: disable=redefined-outer-name
+#     """Function to create default characters and phonemes"""
+#     # create graphemes
+#     = (
+#         sorted(list(set(phonemes))) if unique else sorted(list(phonemes))
+#     )  # this is to keep previous models compatible.
+#     _graphemes = list(characters)
+#     _graphemes = [bos] + _graphemes if len(bos) > 0 and bos is not None else _graphemes
+#     _graphemes = [eos] + _graphemes if len(bos) > 0 and eos is not None else _graphemes
+#     _graphemes = [pad] + _graphemes if len(bos) > 0 and pad is not None else _graphemes
+#     _graphemes = [blank] + _graphemes if len(bos) > 0 and blank is not None else _graphemes
+#     _graphemes = _graphemes + list(punctuations)
+#     return _graphemes, _phonemes


-def create_phonemes(
-    phonemes=_phonemes, punctuations=_punctuations, pad=_pad, eos=_eos, bos=_bos, blank=_blank, unique=True
-):
-    # create phonemes
-    _phonemes = None
-    _phonemes_sorted = (
-        sorted(list(set(phonemes))) if unique else sorted(list(phonemes))
-    )  # this is to keep previous models compatible.
-    _phonemes = list(_phonemes_sorted)
-    _phonemes = [bos] + _phonemes if len(bos) > 0 and bos is not None else _phonemes
-    _phonemes = [eos] + _phonemes if len(bos) > 0 and eos is not None else _phonemes
-    _phonemes = [pad] + _phonemes if len(bos) > 0 and pad is not None else _phonemes
-    _phonemes = [blank] + _phonemes if len(bos) > 0 and blank is not None else _phonemes
-    _phonemes = _phonemes + list(punctuations)
-    _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations)
-    return _phonemes
+# def create_phonemes(
+#     phonemes=_phonemes, punctuations=_punctuations, pad=_pad, eos=_eos, bos=_bos, blank=_blank, unique=True
+# ):
+#     # create phonemes
+#     _phonemes = None
+#     _phonemes_sorted = (
+#         sorted(list(set(phonemes))) if unique else sorted(list(phonemes))
+#     )  # this is to keep previous models compatible.
+#     _phonemes = list(_phonemes_sorted)
+#     _phonemes = [bos] + _phonemes if len(bos) > 0 and bos is not None else _phonemes
+#     _phonemes = [eos] + _phonemes if len(bos) > 0 and eos is not None else _phonemes
+#     _phonemes = [pad] + _phonemes if len(bos) > 0 and pad is not None else _phonemes
+#     _phonemes = [blank] + _phonemes if len(bos) > 0 and blank is not None else _phonemes
+#     _phonemes = _phonemes + list(punctuations)
+#     _phonemes = [pad, eos, bos] + list(_phonemes_sorted) + list(punctuations)
+#     return _phonemes


-graphemes = create_graphemes(_characters, _phonemes, _punctuations, _pad, _eos, _bos)
-phonemes = create_phonemes(_phonemes, _punctuations, _pad, _eos, _bos, _blank)
+# DEF_GRAPHEMES = create_graphemes(_characters, _phonemes, _punctuations, _pad, _eos, _bos)
+# DEF_PHONEMES = create_phonemes(_phonemes, _punctuations, _pad, _eos, _bos, _blank)


 class BaseCharacters:
@@ -114,7 +122,7 @@ class BaseCharacters:
        eos: str,
        bos: str,
        blank: str,
-        is_unique: bool = True,
+        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        self._characters = characters
@@ -202,14 +210,20 @@ class BaseCharacters:
        _vocab = [self._pad] + _vocab if self._pad is not None and len(self._pad) > 0 else _vocab
        self._vocab = _vocab + list(self._punctuations)
        self._char_to_id = {char: idx for idx, char in enumerate(self.vocab)}
-        self._id_to_char = {idx: char for idx, char in enumerate(self.vocab)}
+        self._id_to_char = {
+            idx: char for idx, char in enumerate(self.vocab)  # pylint: disable=unnecessary-comprehension
+        }
        if self.is_unique:
+            duplicates = {x for x in self.vocab if self.vocab.count(x) > 1}
            assert (
                len(self.vocab) == len(self._char_to_id) == len(self._id_to_char)
-            ), f" [!] There are duplicate characters in the character set. {set([x for x in self.vocab if self.vocab.count(x) > 1])}"
+            ), f" [!] There are duplicate characters in the character set. {duplicates}"

    def char_to_id(self, char: str) -> int:
-        return self._char_to_id[char]
+        try:
+            return self._char_to_id[char]
+        except KeyError as e:
+            raise KeyError(f" [!] {repr(char)} is not in the vocabulary.") from e

    def id_to_char(self, idx: int) -> str:
        return self._id_to_char[idx]
@@ -229,9 +243,23 @@ class BaseCharacters:
        print(f"{indent}| > Num chars: {self.num_chars}")

    @staticmethod
-    def init_from_config(config: "Coqpit"):
-        return BaseCharacters(
-            **config.characters if config.characters is not None else {},
+    def init_from_config(config: "Coqpit"):  # pylint: disable=unused-argument
+        """Init your character class from a config.
+
+        Implement this method for your subclass.
+        """
+        ...
+
+    def to_config(self) -> "CharactersConfig":
+        return CharactersConfig(
+            characters=self._characters,
+            punctuations=self._punctuations,
+            pad=self._pad,
+            eos=self._eos,
+            bos=self._bos,
+            blank=self._blank,
+            is_unique=self.is_unique,
+            is_sorted=self.is_sorted,
        )


@@ -275,31 +303,42 @@ class IPAPhonemes(BaseCharacters):
        eos: str = _eos,
        bos: str = _bos,
        blank: str = _blank,
-        is_unique: bool = True,
+        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)

    @staticmethod
    def init_from_config(config: "Coqpit"):
+        """Init a IPAPhonemes object from a model config
+
+        If characters are not defined in the config, it will be set to the default characters and the config
+        will be updated.
+        """
        # band-aid for compatibility with old models
        if "characters" in config and config.characters is not None:
            if "phonemes" in config.characters and config.characters.phonemes is not None:
                config.characters["characters"] = config.characters["phonemes"]
-            return IPAPhonemes(
-                characters=config.characters["characters"],
-                punctuations=config.characters["punctuations"],
-                pad=config.characters["pad"],
-                eos=config.characters["eos"],
-                bos=config.characters["bos"],
-                blank=config.characters["blank"],
-                is_unique=config.characters["is_unique"],
-                is_sorted=config.characters["is_sorted"],
-            )
-        else:
-            return IPAPhonemes(
-                **config.characters if config.characters is not None else {},
+            return (
+                IPAPhonemes(
+                    characters=config.characters["characters"],
+                    punctuations=config.characters["punctuations"],
+                    pad=config.characters["pad"],
+                    eos=config.characters["eos"],
+                    bos=config.characters["bos"],
+                    blank=config.characters["blank"],
+                    is_unique=config.characters["is_unique"],
+                    is_sorted=config.characters["is_sorted"],
+                ),
+                config,
            )
+        # use character set from config
+        if config.characters is not None:
+            return IPAPhonemes(**config.characters), config
+        # return default character set
+        characters = IPAPhonemes()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config


 class Graphemes(BaseCharacters):
@@ -339,24 +378,42 @@ class Graphemes(BaseCharacters):
        eos: str = _eos,
        bos: str = _bos,
        blank: str = _blank,
-        is_unique: bool = True,
+        is_unique: bool = False,
        is_sorted: bool = True,
    ) -> None:
        super().__init__(characters, punctuations, pad, eos, bos, blank, is_unique, is_sorted)

    @staticmethod
    def init_from_config(config: "Coqpit"):
-        return Graphemes(
-            **config.characters if config.characters is not None else {},
-        )
+        """Init a Graphemes object from a model config
+
+        If characters are not defined in the config, it will be set to the default characters and the config
+        will be updated.
+        """
+        if config.characters is not None:
+            # band-aid for compatibility with old models
+            if "phonemes" in config.characters:
+                return (
+                    Graphemes(
+                        characters=config.characters["characters"],
+                        punctuations=config.characters["punctuations"],
+                        pad=config.characters["pad"],
+                        eos=config.characters["eos"],
+                        bos=config.characters["bos"],
+                        blank=config.characters["blank"],
+                        is_unique=config.characters["is_unique"],
+                        is_sorted=config.characters["is_sorted"],
+                    ),
+                    config,
+                )
+            return Graphemes(**config.characters), config
+        characters = Graphemes()
+        new_config = replace(config, characters=characters.to_config())
+        return characters, new_config


 if __name__ == "__main__":
    gr = Graphemes()
    ph = IPAPhonemes()
-
-    print(gr.vocab)
-    print(ph.vocab)
-
-    print(gr.num_chars)
-    assert "a" == gr.id_to_char(gr.char_to_id("a"))
+    gr.print_log()
+    ph.print_log()
--- a/TTS/tts/utils/text/phonemizers/base.py
+++ b/TTS/tts/utils/text/phonemizers/base.py
@@ -1,6 +1,5 @@
 import abc
-import itertools
-from typing import List, Tuple, Union
+from typing import List, Tuple

 from TTS.tts.utils.text.punctuation import Punctuation

@@ -8,6 +7,19 @@ from TTS.tts.utils.text.punctuation import Punctuation
 class BasePhonemizer(abc.ABC):
    """Base phonemizer class

+    Phonemization follows the following steps:
+        1. Preprocessing:
+            - remove empty lines
+            - remove punctuation
+            - keep track of punctuation marks
+
+        2. Phonemization:
+            - convert text to phonemes
+
+        3. Postprocessing:
+            - join phonemes
+            - restore punctuation marks
+
    Args:
        language (str):
            Language used by the phonemizer.
@@ -51,40 +63,30 @@ class BasePhonemizer(abc.ABC):
    @abc.abstractmethod
    def name():
        """The name of the backend"""
+        ...

    @classmethod
    @abc.abstractmethod
    def is_available(cls):
        """Returns True if the backend is installed, False otherwise"""
+        ...

    @classmethod
    @abc.abstractmethod
    def version(cls):
        """Return the backend version as a tuple (major, minor, patch)"""
+        ...

+    @staticmethod
    @abc.abstractmethod
    def supported_languages():
        """Return a dict of language codes -> name supported by the backend"""
+        ...

    def is_supported_language(self, language):
        """Returns True if `language` is supported by the backend"""
        return language in self.supported_languages()

-    fr"""
-        Phonemization follows the following steps:
-            1. Preprocessing:
-                - remove empty lines
-                - remove punctuation
-                - keep track of punctuation marks
-
-            2. Phonemization:
-                - convert text to phonemes
-
-            3. Postprocessing:
-                - join phonemes
-                - restore punctuation marks
-    """
-
    @abc.abstractmethod
    def _phonemize(self, text, separator):
        """The main phonemization method"""
--- a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py
+++ b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py
@@ -28,29 +28,30 @@ def _espeak_exe(espeak_lib: str, args: List, sync=False) -> List[str]:
        "1",  # UTF8 text encoding
    ]
    cmd.extend(args)
-    logging.debug("espeakng: executing %s" % repr(cmd))
-    p = subprocess.Popen(
+    logging.debug("espeakng: executing %s", repr(cmd))
+
+    with subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
-    )
-    res = iter(p.stdout.readline, b"")
-    if not sync:
+    ) as p:
+        res = iter(p.stdout.readline, b"")
+        if not sync:
+            p.stdout.close()
+            if p.stderr:
+                p.stderr.close()
+            if p.stdin:
+                p.stdin.close()
+            return res
+        res2 = []
+        for line in res:
+            res2.append(line)
        p.stdout.close()
        if p.stderr:
            p.stderr.close()
        if p.stdin:
            p.stdin.close()
-        return res
-    res2 = []
-    for line in res:
-        res2.append(line)
-    p.stdout.close()
-    if p.stderr:
-        p.stderr.close()
-    if p.stdin:
-        p.stdin.close()
-    p.wait()
+        p.wait()
    return res2


@@ -85,7 +86,24 @@ class ESpeak(BasePhonemizer):
    def __init__(self, language: str, backend=None, punctuations=Punctuation.default_puncs(), keep_puncs=True):
        if self._ESPEAK_LIB is None:
            raise Exception("Unknown backend: %s" % backend)
+
+        # band-aid for backwards compatibility
+        if language == "en":
+            language = "en-us"
+
        super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
+        if backend is not None:
+            self.backend = backend
+
+    @property
+    def backend(self):
+        return self._ESPEAK_LIB
+
+    @backend.setter
+    def backend(self, backend):
+        if backend not in ["espeak", "espeak-ng"]:
+            raise Exception("Unknown backend: %s" % backend)
+        self._ESPEAK_LIB = backend

    def auto_set_espeak_lib(self) -> None:
        if is_tool("espeak-ng"):
@@ -115,24 +133,25 @@ class ESpeak(BasePhonemizer):
        # espeak and espeak-ng parses `ipa` differently
        if tie:
            # use '͡' between phonemes
-            if _DEF_ESPEAK_LIB == "espeak":
+            if self.backend == "espeak":
                args.append("--ipa=1")
            else:
                args.append("--ipa=3")
        else:
            # split with '_'
-            if _DEF_ESPEAK_LIB == "espeak":
+            if self.backend == "espeak":
                args.append("--ipa=3")
            else:
                args.append("--ipa=1")
        if tie:
            args.append("--tie=%s" % tie)
+
        args.append('"' + text + '"')
        # compute phonemes
        phonemes = ""
        for line in _espeak_exe(self._ESPEAK_LIB, args, sync=True):
-            logging.debug("line: %s" % repr(line))
-            phonemes += line.decode("utf8").strip()
+            logging.debug("line: %s", repr(line))
+            phonemes += line.decode("utf8").strip()[2:]  # skip two redundant characters
        return phonemes.replace("_", separator)

    def _phonemize(self, text, separator=None):
@@ -146,7 +165,7 @@ class ESpeak(BasePhonemizer):
            Dict: Dictionary of language codes.
        """
        if _DEF_ESPEAK_LIB is None:
-            raise {}
+            return {}
        args = ["--voices"]
        langs = {}
        count = 0
@@ -157,7 +176,7 @@ class ESpeak(BasePhonemizer):
                lang_code = cols[1]
                lang_name = cols[3]
                langs[lang_code] = lang_name
-            logging.debug("line: %s" % repr(line))
+            logging.debug("line: %s", repr(line))
            count += 1
        return langs

@@ -168,9 +187,9 @@ class ESpeak(BasePhonemizer):
            str: Version of the used backend.
        """
        args = ["--version"]
-        for line in _espeak_exe(_DEF_ESPEAK_LIB, args, sync=True):
+        for line in _espeak_exe(self.backend, args, sync=True):
            version = line.decode("utf8").strip().split()[2]
-            logging.debug("line: %s" % repr(line))
+            logging.debug("line: %s", repr(line))
            return version

    @classmethod
--- a/TTS/tts/utils/text/phonemizers/gruut_wrapper.py
+++ b/TTS/tts/utils/text/phonemizers/gruut_wrapper.py
@@ -1,5 +1,4 @@
 import importlib
-from os import stat
 from typing import List

 import gruut
@@ -55,7 +54,7 @@ class Gruut(BasePhonemizer):
    def name():
        return "gruut"

-    def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str:
+    def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str:  # pylint: disable=unused-argument
        """Convert input text to phonemes.

        Gruut phonemizes the given `str` by seperating each phoneme character with `separator`, even for characters
--- a/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py
+++ b/TTS/tts/utils/text/phonemizers/ja_jp_phonemizer.py
@@ -30,7 +30,7 @@ class JA_JP_Phonemizer(BasePhonemizer):

    language = "ja-jp"

-    def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs):
+    def __init__(self, punctuations=_DEF_JA_PUNCS, keep_puncs=True, **kwargs):  # pylint: disable=unused-argument
        super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)

    @staticmethod
@@ -61,12 +61,12 @@ class JA_JP_Phonemizer(BasePhonemizer):
        return True


-if __name__ == "__main__":
-    text = "これは、電話をかけるための私の日本語の例のテキストです。"
-    e = JA_JP_Phonemizer()
-    print(e.supported_languages())
-    print(e.version())
-    print(e.language)
-    print(e.name())
-    print(e.is_available())
-    print("`" + e.phonemize(text) + "`")
+# if __name__ == "__main__":
+#     text = "これは、電話をかけるための私の日本語の例のテキストです。"
+#     e = JA_JP_Phonemizer()
+#     print(e.supported_languages())
+#     print(e.version())
+#     print(e.language)
+#     print(e.name())
+#     print(e.is_available())
+#     print("`" + e.phonemize(text) + "`")
--- a/TTS/tts/utils/text/phonemizers/multi_phonemizer.py
+++ b/TTS/tts/utils/text/phonemizers/multi_phonemizer.py
@@ -17,7 +17,7 @@ class MultiPhonemizer:
    lang_to_phonemizer_name = DEF_LANG_TO_PHONEMIZER
    language = "multi-lingual"

-    def __init__(self, custom_lang_to_phonemizer: Dict = {}) -> None:
+    def __init__(self, custom_lang_to_phonemizer: Dict = {}) -> None:  # pylint: disable=dangerous-default-value
        self.lang_to_phonemizer_name.update(custom_lang_to_phonemizer)
        self.lang_to_phonemizer = self.init_phonemizers(self.lang_to_phonemizer_name)

@@ -40,16 +40,16 @@ class MultiPhonemizer:
        return list(self.lang_to_phonemizer_name.keys())


-if __name__ == "__main__":
-    texts = {
-        "tr": "Merhaba, bu Türkçe bit örnek!",
-        "en-us": "Hello, this is English example!",
-        "de": "Hallo, das ist ein Deutches Beipiel!",
-        "zh-cn": "这是中国的例子",
-    }
-    phonemes = {}
-    ph = MultiPhonemizer()
-    for lang, text in texts.items():
-        phoneme = ph.phonemize(text, lang)
-        phonemes[lang] = phoneme
-    print(phonemes)
+# if __name__ == "__main__":
+#     texts = {
+#         "tr": "Merhaba, bu Türkçe bit örnek!",
+#         "en-us": "Hello, this is English example!",
+#         "de": "Hallo, das ist ein Deutches Beipiel!",
+#         "zh-cn": "这是中国的例子",
+#     }
+#     phonemes = {}
+#     ph = MultiPhonemizer()
+#     for lang, text in texts.items():
+#         phoneme = ph.phonemize(text, lang)
+#         phonemes[lang] = phoneme
+#     print(phonemes)
--- a/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py
+++ b/TTS/tts/utils/text/phonemizers/zh_cn_phonemizer.py
@@ -25,14 +25,15 @@ class ZH_CN_Phonemizer(BasePhonemizer):

    language = "zh-cn"

-    def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs):
+    def __init__(self, punctuations=_DEF_ZH_PUNCS, keep_puncs=False, **kwargs):  # pylint: disable=unused-argument
        super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs)

    @staticmethod
    def name():
        return "zh_cn_phonemizer"

-    def phonemize_zh_cn(self, text: str, separator: str = "|") -> str:
+    @staticmethod
+    def phonemize_zh_cn(text: str, separator: str = "|") -> str:
        ph = chinese_text_to_phonemes(text, separator)
        return ph

@@ -50,12 +51,12 @@ class ZH_CN_Phonemizer(BasePhonemizer):
        return True


-if __name__ == "__main__":
-    text = "这是，样本中文。"
-    e = ZH_CN_Phonemizer()
-    print(e.supported_languages())
-    print(e.version())
-    print(e.language)
-    print(e.name())
-    print(e.is_available())
-    print("`" + e.phonemize(text) + "`")
+# if __name__ == "__main__":
+#     text = "这是，样本中文。"
+#     e = ZH_CN_Phonemizer()
+#     print(e.supported_languages())
+#     print(e.version())
+#     print(e.language)
+#     print(e.name())
+#     print(e.is_available())
+#     print("`" + e.phonemize(text) + "`")
--- a/TTS/tts/utils/text/punctuation.py
+++ b/TTS/tts/utils/text/punctuation.py
@@ -130,7 +130,7 @@ class Punctuation:
        return cls._restore(text, puncs, 0)

    @classmethod
-    def _restore(cls, text, puncs, num):
+    def _restore(cls, text, puncs, num):  # pylint: disable=too-many-return-statements
        """Auxiliary method for Punctuation.restore()"""
        if not puncs:
            return text
@@ -159,14 +159,14 @@ class Punctuation:
        return cls._restore([text[0] + current.punc + text[1]] + text[2:], puncs[1:], num)


-if __name__ == "__main__":
-    punc = Punctuation()
-    text = "This is. This is, example!"
+# if __name__ == "__main__":
+#     punc = Punctuation()
+#     text = "This is. This is, example!"

-    print(punc.strip(text))
+#     print(punc.strip(text))

-    split_text, puncs = punc.strip_to_restore(text)
-    print(split_text, " ---- ", puncs)
+#     split_text, puncs = punc.strip_to_restore(text)
+#     print(split_text, " ---- ", puncs)

-    restored_text = punc.restore(split_text, puncs)
-    print(restored_text)
+#     restored_text = punc.restore(split_text, puncs)
+#     print(restored_text)
--- a/TTS/utils/audio.py
+++ b/TTS/utils/audio.py
@@ -383,8 +383,7 @@ class AudioProcessor(object):
    def init_from_config(config: "Coqpit"):
        if "audio" in config:
            return AudioProcessor(**config.audio)
-        else:
-            return AudioProcessor(**config)
+        return AudioProcessor(**config)

    ### setting up the parameters ###
    def _build_mel_basis(
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -13,7 +13,6 @@ from TTS.tts.utils.speakers import SpeakerManager
 # pylint: disable=unused-wildcard-import
 # pylint: disable=wildcard-import
 from TTS.tts.utils.synthesis import synthesis, trim_silence
-from TTS.tts.utils.text import TTSTokenizer
 from TTS.utils.audio import AudioProcessor
 from TTS.vocoder.models import setup_model as setup_vocoder_model
 from TTS.vocoder.utils.generic_utils import interpolate_vocoder_input
--- a/TTS/vocoder/models/gan.py
+++ b/TTS/vocoder/models/gan.py
@@ -314,7 +314,7 @@ class GAN(BaseVocoder):
        data_items: List,
        verbose: bool,
        num_gpus: int,
-        rank: int = 0,  # pylint: disable=unused-argument
+        rank: int = None,  # pylint: disable=unused-argument
    ):
        """Initiate and return the GAN dataloader.