Mirror of https://github.com/coqui-ai/TTS.git
Commit aebbdfc62b: Merge branch 'dev-managers' into dev-emotion
@@ -1,4 +1,4 @@
name: tts-tests
name: text-tests

on:
  push:
@@ -115,6 +115,7 @@ venv.bak/
*.swo

# pytorch models
*.pth
*.pth.tar
result/
@@ -0,0 +1,20 @@
cff-version: 1.2.0
message: "If you want to cite 🐸💬, feel free to use this (but only if you loved it 😊)"
title: "Coqui TTS"
abstract: "A deep learning toolkit for Text-to-Speech, battle-tested in research and production"
date-released: 2021-01-01
authors:
  - family-names: "Eren"
    given-names: "Gölge"
  - name: "The Coqui TTS Team"
version: 1.4
doi: 10.5281/zenodo.6334862
license: "MPL-2.0"
url: "https://www.coqui.ai"
repository-code: "https://github.com/coqui-ai/TTS"
keywords:
  - machine learning
  - deep learning
  - artificial intelligence
  - text to speech
  - TTS
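Since the new CITATION.cff above is plain YAML, its fields can be read programmatically. A minimal sketch, assuming PyYAML is installed and the file sits at the repository root:

```python
# Read the citation metadata shown above (CITATION.cff is plain YAML).
import yaml

with open("CITATION.cff", encoding="utf-8") as f:
    citation = yaml.safe_load(f)

# e.g. "Coqui TTS 1.4, doi: 10.5281/zenodo.6334862"
print(f"{citation['title']} {citation['version']}, doi: {citation['doi']}")
```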
@@ -1,6 +1,7 @@
include README.md
include LICENSE.txt
include requirements.*.txt
include *.cff
include requirements.txt
include TTS/VERSION
recursive-include TTS *.json
Makefile (2 changed lines)
@@ -44,6 +44,8 @@ style: ## update code style.

lint: ## run pylint linter.
	pylint ${target_dirs}
	black ${target_dirs} --check
	isort ${target_dirs} --check-only

system-deps: ## install linux system deps
	sudo apt-get install -y libsndfile1-dev
@@ -159,13 +159,13 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
- Run your own TTS model (Using Griffin-Lim Vocoder):

```
$ tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
```

- Run your own TTS and Vocoder models:
```
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
    --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
    --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```

### Multi-speaker Models

@@ -185,7 +185,7 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht
- Run your own multi-speaker TTS model:

```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```

## Directory Structure
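The CLI calls in the README hunks above map onto the `TTS.utils.synthesizer.Synthesizer` class that the `tts` entry point uses internally. A rough Python equivalent is sketched below; the keyword names are assumptions based on `TTS/utils/synthesizer.py` at the time of this change and may differ between versions.

```python
# Hedged sketch: Python equivalent of the `tts` CLI usage above.
# Constructor keyword names are assumptions; check TTS/utils/synthesizer.py.
from TTS.utils.synthesizer import Synthesizer

synthesizer = Synthesizer(
    tts_checkpoint="path/to/model.pth",
    tts_config_path="path/to/config.json",
    vocoder_checkpoint="path/to/vocoder.pth",      # optional; Griffin-Lim is used if omitted
    vocoder_config="path/to/vocoder_config.json",  # optional
    use_cuda=False,
)
wav = synthesizer.tts("Text for TTS")
synthesizer.save_wav(wav, "output/path/speech.wav")
```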
@@ -25,7 +25,7 @@ These masks can be used for different purposes including training a TTS model wi
"""
Example run:
CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py
    --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth.tar
    --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
    --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
    --dataset_metafile metadata.csv
    --data_path /root/LJSpeech-1.1/
@@ -13,7 +13,7 @@ parser = argparse.ArgumentParser(
    description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
    """
    Example runs:
    python TTS/bin/compute_embeddings.py speaker_encoder_model.pth.tar speaker_encoder_config.json dataset_config.json embeddings_output_path/
    python TTS/bin/compute_embeddings.py speaker_encoder_model.pth speaker_encoder_config.json dataset_config.json embeddings_output_path/
    """,
    formatter_class=RawTextHelpFormatter,
)
@@ -1,55 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import pathlib
import subprocess
import time

import torch
from trainer import TrainerArgs


def main():
    """
    Call train.py as a new process and pass command arguments
    """
    parser = TrainerArgs().init_argparse(arg_prefix="")
    parser.add_argument("--script", type=str, help="Target training script to distribute.")
    args, unargs = parser.parse_known_args()

    num_gpus = torch.cuda.device_count()
    group_id = time.strftime("%Y_%m_%d-%H%M%S")

    # set arguments for train.py
    folder_path = pathlib.Path(__file__).parent.absolute()
    if os.path.exists(os.path.join(folder_path, args.script)):
        command = [os.path.join(folder_path, args.script)]
    else:
        command = [args.script]
    command.append("--continue_path={}".format(args.continue_path))
    command.append("--restore_path={}".format(args.restore_path))
    command.append("--config_path={}".format(args.config_path))
    command.append("--group_id=group_{}".format(group_id))
    command.append("--use_ddp=true")
    command += unargs
    command.append("")

    # run processes
    processes = []
    for i in range(num_gpus):
        my_env = os.environ.copy()
        my_env["PYTHON_EGG_CACHE"] = "/tmp/tmp{}".format(i)
        command[-1] = "--rank={}".format(i)
        # prevent stdout for processes with rank != 0
        stdout = None
        p = subprocess.Popen(["python3"] + command, stdout=stdout, env=my_env)  # pylint: disable=consider-using-with
        processes.append(p)
        print(command)

    for p in processes:
        p.wait()


if __name__ == "__main__":
    main()
@@ -1,18 +1,18 @@
import argparse
import torch
from argparse import RawTextHelpFormatter

import torch
from tqdm import tqdm

from TTS.config import load_config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.speakers import SpeakerManager


def compute_encoder_accuracy(dataset_items, encoder_manager):

    class_name_key = encoder_manager.encoder_config.class_name_key
    map_classid_to_classname = getattr(encoder_manager.encoder_config, 'map_classid_to_classname', None)

    class_acc_dict = {}

    # compute embeddings for all wav_files
@@ -43,11 +43,11 @@ def compute_encoder_accuracy(dataset_items, encoder_manager):

    acc_avg = 0
    for key, values in class_acc_dict.items():
        acc = sum(values)/len(values)
        acc = sum(values) / len(values)
        print("Class", key, "Accuracy:", acc)
        acc_avg += acc

    print("Average Accuracy:", acc_avg/len(class_acc_dict))
    print("Average Accuracy:", acc_avg / len(class_acc_dict))


if __name__ == "__main__":
@@ -55,7 +55,7 @@ if __name__ == "__main__":
        description="""Compute the accuracy of the encoder.\n\n"""
        """
        Example runs:
        python TTS/bin/eval_encoder.py emotion_encoder_model.pth.tar emotion_encoder_config.json dataset_config.json
        python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
        """,
        formatter_class=RawTextHelpFormatter,
    )
@@ -60,13 +60,13 @@ If you don't specify any models, then it uses LJSpeech based English model.
- Run your own TTS model (Using Griffin-Lim Vocoder):

```
$ tts --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav
$ tts --text "Text for TTS" --model_path path/to/model.pth --config_path path/to/config.json --out_path output/path/speech.wav
```

- Run your own TTS and Vocoder models:
```
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav
    --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json
$ tts --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth --out_path output/path/speech.wav
    --vocoder_path path/to/vocoder.pth --vocoder_config_path path/to/vocoder_config.json
```

### Multi-speaker Models

@@ -86,7 +86,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
- Run your own multi-speaker TTS model:

```
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
$ tts --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
```
"""
# We remove Markdown code formatting programmatically here to allow us to copy-and-paste from main README to keep
@@ -217,7 +217,13 @@ If you don't specify any models, then it uses LJSpeech based English model.
args = parser.parse_args()

# print the description if either text or list_models is not set
if not args.text and not args.list_models and not args.list_speaker_idxs and not args.list_language_idxs and not args.reference_wav:
if (
    not args.text
    and not args.list_models
    and not args.list_speaker_idxs
    and not args.list_language_idxs
    and not args.reference_wav
):
    parser.parse_args(["-h"])

# load model manager
@@ -306,7 +312,15 @@ If you don't specify any models, then it uses LJSpeech based English model.
print(" > Text: {}".format(args.text))

# kick it
wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav, reference_wav=args.reference_wav, reference_speaker_name=args.reference_speaker_idx, emotion_name=args.emotion_idx)
wav = synthesizer.tts(
    args.text,
    args.speaker_idx,
    args.language_idx,
    args.speaker_wav,
    reference_wav=args.reference_wav,
    reference_speaker_name=args.reference_speaker_idx,
    emotion_name=args.emotion_idx
)

# save the results
print(" > Saving output to {}".format(args.out_path))
@ -9,6 +9,7 @@ import traceback
|
|||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from trainer.torch import NoamLR
|
||||
from trainer.trainer_utils import get_optimizer
|
||||
|
||||
from TTS.encoder.dataset import EncoderDataset
|
||||
from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model
|
||||
|
@ -19,7 +20,6 @@ from TTS.tts.datasets import load_tts_samples
|
|||
from TTS.utils.audio import AudioProcessor
|
||||
from TTS.utils.generic_utils import count_parameters, remove_experiment_folder
|
||||
from TTS.utils.io import copy_model_files
|
||||
from trainer.trainer_utils import get_optimizer
|
||||
from TTS.utils.training import check_update
|
||||
|
||||
torch.backends.cudnn.enabled = True
|
||||
|
@ -52,16 +52,21 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
|
|||
sampler = PerfectBatchSampler(
|
||||
dataset.items,
|
||||
classes,
|
||||
batch_size=num_classes_in_batch*num_utter_per_class, # total batch size
|
||||
batch_size=num_classes_in_batch * num_utter_per_class, # total batch size
|
||||
num_classes_in_batch=num_classes_in_batch,
|
||||
num_gpus=1,
|
||||
shuffle=not is_val,
|
||||
drop_last=True)
|
||||
drop_last=True,
|
||||
)
|
||||
|
||||
if len(classes) < num_classes_in_batch:
|
||||
if is_val:
|
||||
raise RuntimeError(f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !")
|
||||
raise RuntimeError(f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !")
|
||||
raise RuntimeError(
|
||||
f"config.eval_num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Eval dataset) !"
|
||||
)
|
||||
raise RuntimeError(
|
||||
f"config.num_classes_in_batch ({num_classes_in_batch}) need to be <= {len(classes)} (Number total of Classes in the Train dataset) !"
|
||||
)
|
||||
|
||||
# set the classes to avoid get wrong class_id when the number of training and eval classes are not equal
|
||||
if is_val:
|
||||
|
@ -76,6 +81,7 @@ def setup_loader(ap: AudioProcessor, is_val: bool = False, verbose: bool = False
|
|||
|
||||
return loader, classes, dataset.get_map_classid_to_classname()
|
||||
|
||||
|
||||
def evaluation(model, criterion, data_loader, global_step):
|
||||
eval_loss = 0
|
||||
for _, data in enumerate(data_loader):
|
||||
|
@@ -84,8 +90,12 @@ def evaluation(model, criterion, data_loader, global_step):
        inputs, labels = data

        # group samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
        labels = torch.transpose(labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1).reshape(labels.shape)
        inputs = torch.transpose(inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
        labels = torch.transpose(
            labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
        ).reshape(labels.shape)
        inputs = torch.transpose(
            inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
        ).reshape(inputs.shape)

        # dispatch data to GPU
        if use_cuda:
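The transpose/reshape in this hunk regroups a batch ordered class-wise round-robin (e.g. [3, 2, 1, 3, 2, 1]) into contiguous per-class blocks ([3, 3, 2, 2, 1, 1]). A standalone sketch of the same operation with toy sizes (2 utterances per class, 3 classes in the batch):

```python
# Demonstrates the regrouping used above: round-robin class order -> contiguous per-class blocks.
import torch

num_utter_per_class, num_classes_in_batch = 2, 3
labels = torch.tensor([3, 2, 1, 3, 2, 1])  # order produced by the perfect sampler

regrouped = torch.transpose(
    labels.view(num_utter_per_class, num_classes_in_batch), 0, 1
).reshape(labels.shape)

print(regrouped.tolist())  # [3, 3, 2, 2, 1, 1]
```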
@ -96,20 +106,23 @@ def evaluation(model, criterion, data_loader, global_step):
|
|||
outputs = model(inputs)
|
||||
|
||||
# loss computation
|
||||
loss = criterion(outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels)
|
||||
loss = criterion(
|
||||
outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
|
||||
)
|
||||
|
||||
eval_loss += loss.item()
|
||||
|
||||
eval_avg_loss = eval_loss/len(data_loader)
|
||||
eval_avg_loss = eval_loss / len(data_loader)
|
||||
# save stats
|
||||
dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
|
||||
# plot the last batch in the evaluation
|
||||
figures = {
|
||||
"UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
|
||||
"UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
|
||||
}
|
||||
dashboard_logger.eval_figures(global_step, figures)
|
||||
return eval_avg_loss
|
||||
|
||||
|
||||
def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
|
||||
model.train()
|
||||
best_loss = float("inf")
|
||||
|
@ -124,8 +137,12 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
|
|||
# setup input data
|
||||
inputs, labels = data
|
||||
# agroup samples of each class in the batch. perfect sampler produces [3,2,1,3,2,1] we need [3,3,2,2,1,1]
|
||||
labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
|
||||
inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
|
||||
labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(
|
||||
labels.shape
|
||||
)
|
||||
inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(
|
||||
inputs.shape
|
||||
)
|
||||
# ToDo: move it to a unit test
|
||||
# labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
|
||||
# inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
|
||||
|
@ -157,7 +174,9 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
|
|||
outputs = model(inputs)
|
||||
|
||||
# loss computation
|
||||
loss = criterion(outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels)
|
||||
loss = criterion(
|
||||
outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels
|
||||
)
|
||||
loss.backward()
|
||||
grad_norm, _ = check_update(model, c.grad_clip)
|
||||
optimizer.step()
|
||||
|
@ -211,7 +230,7 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
|
|||
print(
|
||||
">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
|
||||
"EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
|
||||
epoch, tot_loss/len(data_loader), grad_norm, epoch_time, avg_loader_time
|
||||
epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
|
||||
),
|
||||
flush=True,
|
||||
)
|
||||
|
@ -222,10 +241,8 @@ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader,
|
|||
print("\n\n")
|
||||
print("--> EVAL PERFORMANCE")
|
||||
print(
|
||||
" | > Epoch:{} AvgLoss: {:.5f} ".format(
|
||||
epoch, eval_loss
|
||||
),
|
||||
flush=True,
|
||||
" | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
|
||||
flush=True,
|
||||
)
|
||||
# save the best checkpoint
|
||||
best_loss = save_best_model(model, optimizer, criterion, eval_loss, best_loss, OUT_PATH, global_step, epoch)
|
||||
|
@ -262,7 +279,9 @@ def main(args): # pylint: disable=redefined-outer-name
|
|||
copy_model_files(c, OUT_PATH)
|
||||
|
||||
if args.restore_path:
|
||||
criterion, args.restore_step = model.load_checkpoint(c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion)
|
||||
criterion, args.restore_step = model.load_checkpoint(
|
||||
c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
|
||||
)
|
||||
print(" > Model restored from step %d" % args.restore_step, flush=True)
|
||||
else:
|
||||
args.restore_step = 0
|
||||
|
|
|
@@ -14,5 +14,5 @@ To run the code, you need to follow the same flow as in TTS.

- Define 'config.json' for your needs. Note that the audio parameters should match your TTS model.
- Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth.tar model/config/path/config.json dataset/path/ output_path```. This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
- Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda true /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path```. This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
- Watch training on Tensorboard as in TTS
@ -33,10 +33,7 @@ class BaseEncoderConfig(BaseTrainingConfig):
|
|||
grad_clip: float = 3.0
|
||||
lr: float = 0.0001
|
||||
optimizer: str = "radam"
|
||||
optimizer_params: Dict = field(default_factory=lambda: {
|
||||
"betas": [0.9, 0.999],
|
||||
"weight_decay": 0
|
||||
})
|
||||
optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
|
||||
lr_decay: bool = False
|
||||
warmup_steps: int = 4000
|
||||
|
||||
|
|
|
@ -5,6 +5,7 @@ from torch.utils.data import Dataset
|
|||
|
||||
from TTS.encoder.utils.generic_utils import AugmentWAV
|
||||
|
||||
|
||||
class EncoderDataset(Dataset):
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -57,7 +58,6 @@ class EncoderDataset(Dataset):
|
|||
print(f" | > Num Classes: {len(self.classes)}")
|
||||
print(f" | > Classes: {self.classes}")
|
||||
|
||||
|
||||
def load_wav(self, filename):
|
||||
audio = self.ap.load_wav(filename, sr=self.ap.sample_rate)
|
||||
return audio
|
||||
|
@ -75,9 +75,7 @@ class EncoderDataset(Dataset):
|
|||
]
|
||||
|
||||
# skip classes with number of samples >= self.num_utter_per_class
|
||||
class_to_utters = {
|
||||
k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class
|
||||
}
|
||||
class_to_utters = {k: v for (k, v) in class_to_utters.items() if len(v) >= self.num_utter_per_class}
|
||||
|
||||
classes = list(class_to_utters.keys())
|
||||
classes.sort()
|
||||
|
@ -105,11 +103,11 @@ class EncoderDataset(Dataset):
|
|||
|
||||
def get_class_list(self):
|
||||
return self.classes
|
||||
|
||||
def set_classes(self, classes):
|
||||
self.classes = classes
|
||||
self.classname_to_classid = {key: i for i, key in enumerate(self.classes)}
|
||||
|
||||
|
||||
def get_map_classid_to_classname(self):
|
||||
return dict((c_id, c_n) for c_n, c_id in self.classname_to_classid.items())
|
||||
|
||||
|
|
|
@ -195,6 +195,7 @@ class SoftmaxLoss(nn.Module):
|
|||
class_id = torch.argmax(activations)
|
||||
return class_id
|
||||
|
||||
|
||||
class SoftmaxAngleProtoLoss(nn.Module):
|
||||
"""
|
||||
Implementation of the Softmax AnglePrototypical loss as defined in https://arxiv.org/abs/2009.14153
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
import numpy as np
|
||||
import torch
|
||||
import torchaudio
|
||||
import numpy as np
|
||||
from coqpit import Coqpit
|
||||
from torch import nn
|
||||
|
||||
from TTS.utils.io import load_fsspec
|
||||
from TTS.encoder.losses import AngleProtoLoss, GE2ELoss, SoftmaxAngleProtoLoss
|
||||
from TTS.utils.generic_utils import set_init_dict
|
||||
from coqpit import Coqpit
|
||||
from TTS.utils.io import load_fsspec
|
||||
|
||||
|
||||
class PreEmphasis(nn.Module):
|
||||
def __init__(self, coefficient=0.97):
|
||||
|
@ -20,6 +21,7 @@ class PreEmphasis(nn.Module):
|
|||
x = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), "reflect")
|
||||
return torch.nn.functional.conv1d(x, self.filter).squeeze(1)
|
||||
|
||||
|
||||
class BaseEncoder(nn.Module):
|
||||
"""Base `encoder` class. Every new `encoder` model must inherit this.
|
||||
|
||||
|
@ -32,31 +34,31 @@ class BaseEncoder(nn.Module):
|
|||
|
||||
def get_torch_mel_spectrogram_class(self, audio_config):
|
||||
return torch.nn.Sequential(
|
||||
PreEmphasis(audio_config["preemphasis"]),
|
||||
# TorchSTFT(
|
||||
# n_fft=audio_config["fft_size"],
|
||||
# hop_length=audio_config["hop_length"],
|
||||
# win_length=audio_config["win_length"],
|
||||
# sample_rate=audio_config["sample_rate"],
|
||||
# window="hamming_window",
|
||||
# mel_fmin=0.0,
|
||||
# mel_fmax=None,
|
||||
# use_htk=True,
|
||||
# do_amp_to_db=False,
|
||||
# n_mels=audio_config["num_mels"],
|
||||
# power=2.0,
|
||||
# use_mel=True,
|
||||
# mel_norm=None,
|
||||
# )
|
||||
torchaudio.transforms.MelSpectrogram(
|
||||
sample_rate=audio_config["sample_rate"],
|
||||
n_fft=audio_config["fft_size"],
|
||||
win_length=audio_config["win_length"],
|
||||
hop_length=audio_config["hop_length"],
|
||||
window_fn=torch.hamming_window,
|
||||
n_mels=audio_config["num_mels"],
|
||||
)
|
||||
)
|
||||
PreEmphasis(audio_config["preemphasis"]),
|
||||
# TorchSTFT(
|
||||
# n_fft=audio_config["fft_size"],
|
||||
# hop_length=audio_config["hop_length"],
|
||||
# win_length=audio_config["win_length"],
|
||||
# sample_rate=audio_config["sample_rate"],
|
||||
# window="hamming_window",
|
||||
# mel_fmin=0.0,
|
||||
# mel_fmax=None,
|
||||
# use_htk=True,
|
||||
# do_amp_to_db=False,
|
||||
# n_mels=audio_config["num_mels"],
|
||||
# power=2.0,
|
||||
# use_mel=True,
|
||||
# mel_norm=None,
|
||||
# )
|
||||
torchaudio.transforms.MelSpectrogram(
|
||||
sample_rate=audio_config["sample_rate"],
|
||||
n_fft=audio_config["fft_size"],
|
||||
win_length=audio_config["win_length"],
|
||||
hop_length=audio_config["hop_length"],
|
||||
window_fn=torch.hamming_window,
|
||||
n_mels=audio_config["num_mels"],
|
||||
),
|
||||
)
|
||||
|
||||
@torch.no_grad()
|
||||
def inference(self, x, l2_norm=True):
|
||||
|
@ -104,7 +106,9 @@ class BaseEncoder(nn.Module):
|
|||
raise Exception("The %s not is a loss supported" % c.loss)
|
||||
return criterion
|
||||
|
||||
def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None):
|
||||
def load_checkpoint(
|
||||
self, config: Coqpit, checkpoint_path: str, eval: bool = False, use_cuda: bool = False, criterion=None
|
||||
):
|
||||
state = load_fsspec(checkpoint_path, map_location=torch.device("cpu"))
|
||||
try:
|
||||
self.load_state_dict(state["model"])
|
||||
|
@ -127,7 +131,12 @@ class BaseEncoder(nn.Module):
|
|||
print(" > Criterion load ignored because of:", error)
|
||||
|
||||
# instance and load the criterion for the encoder classifier in inference time
|
||||
if eval and criterion is None and "criterion" in state and getattr(config, 'map_classid_to_classname', None) is not None:
|
||||
if (
|
||||
eval
|
||||
and criterion is None
|
||||
and "criterion" in state
|
||||
and getattr(config, "map_classid_to_classname", None) is not None
|
||||
):
|
||||
criterion = self.get_criterion(config, len(config.map_classid_to_classname))
|
||||
criterion.load_state_dict(state["criterion"])
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ from torch import nn
|
|||
# from TTS.utils.audio import TorchSTFT
|
||||
from TTS.encoder.models.base_encoder import BaseEncoder
|
||||
|
||||
|
||||
class SELayer(nn.Module):
|
||||
def __init__(self, channel, reduction=8):
|
||||
super(SELayer, self).__init__()
|
||||
|
|
|
@ -147,7 +147,7 @@ def setup_encoder_model(config: "Coqpit"):
|
|||
|
||||
|
||||
def save_checkpoint(model, optimizer, criterion, model_loss, out_path, current_step, epoch):
|
||||
checkpoint_path = "checkpoint_{}.pth.tar".format(current_step)
|
||||
checkpoint_path = "checkpoint_{}.pth".format(current_step)
|
||||
checkpoint_path = os.path.join(out_path, checkpoint_path)
|
||||
print(" | | > Checkpoint saving : {}".format(checkpoint_path))
|
||||
|
||||
|
@ -177,7 +177,7 @@ def save_best_model(model, optimizer, criterion, model_loss, best_loss, out_path
|
|||
"date": datetime.date.today().strftime("%B %d, %Y"),
|
||||
}
|
||||
best_loss = model_loss
|
||||
bestmodel_path = "best_model.pth.tar"
|
||||
bestmodel_path = "best_model.pth"
|
||||
bestmodel_path = os.path.join(out_path, bestmodel_path)
|
||||
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
|
||||
save_fsspec(state, bestmodel_path)
|
||||
|
|
|
@ -5,7 +5,7 @@ from TTS.utils.io import save_fsspec
|
|||
|
||||
|
||||
def save_checkpoint(model, optimizer, model_loss, out_path, current_step):
|
||||
checkpoint_path = "checkpoint_{}.pth.tar".format(current_step)
|
||||
checkpoint_path = "checkpoint_{}.pth".format(current_step)
|
||||
checkpoint_path = os.path.join(out_path, checkpoint_path)
|
||||
print(" | | > Checkpoint saving : {}".format(checkpoint_path))
|
||||
|
||||
|
@ -31,7 +31,7 @@ def save_best_model(model, optimizer, model_loss, best_loss, out_path, current_s
|
|||
"date": datetime.date.today().strftime("%B %d, %Y"),
|
||||
}
|
||||
best_loss = model_loss
|
||||
bestmodel_path = "best_model.pth.tar"
|
||||
bestmodel_path = "best_model.pth"
|
||||
bestmodel_path = os.path.join(out_path, bestmodel_path)
|
||||
print("\n > BEST MODEL ({0:.5f}) : {1:}".format(model_loss, bestmodel_path))
|
||||
save_fsspec(state, bestmodel_path)
|
||||
|
|
|
@@ -1,4 +1,5 @@
import random

from torch.utils.data.sampler import Sampler, SubsetRandomSampler


@@ -34,10 +35,21 @@ class PerfectBatchSampler(Sampler):
        drop_last (bool): if True, drops last incomplete batch.
    """

    def __init__(self, dataset_items, classes, batch_size, num_classes_in_batch, num_gpus=1, shuffle=True, drop_last=False, label_key="class_name"):
    def __init__(
        self,
        dataset_items,
        classes,
        batch_size,
        num_classes_in_batch,
        num_gpus=1,
        shuffle=True,
        drop_last=False,
        label_key="class_name",
    ):
        super().__init__(dataset_items)
        assert batch_size % (num_classes_in_batch * num_gpus) == 0, (
            'Batch size must be divisible by number of classes times the number of data parallel devices (if enabled).')
        assert (
            batch_size % (num_classes_in_batch * num_gpus) == 0
        ), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)."

        label_indices = {}
        for idx, item in enumerate(dataset_items):
@@ -93,7 +105,7 @@ class PerfectBatchSampler(Sampler):
            if groups % self._dp_devices == 0:
                yield batch
            else:
                batch = batch[:(groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch]
                batch = batch[: (groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch]
                if len(batch) > 0:
                    yield batch
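The assertion above pins down how `batch_size` relates to the sampler's other arguments: it must be an exact multiple of `num_classes_in_batch * num_gpus`, so every device gets whole per-class groups. A small sketch with hypothetical numbers, sized the way `setup_loader()` earlier in this diff does (`batch_size = num_classes_in_batch * num_utter_per_class`):

```python
# Hypothetical sizing for the PerfectBatchSampler constraint shown above.
num_classes_in_batch = 8   # distinct classes (speakers/emotions) per batch
num_utter_per_class = 4    # utterances drawn from each class
num_gpus = 2

batch_size = num_classes_in_batch * num_utter_per_class  # 32, the "total batch size"
assert batch_size % (num_classes_in_batch * num_gpus) == 0  # 32 % 16 == 0, valid

print(batch_size)  # 32
```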
TTS/model.py (142 changed lines)
@@ -1,46 +1,34 @@
from abc import ABC, abstractmethod
from typing import Dict, List, Tuple
from abc import abstractmethod
from typing import Dict

import torch
from coqpit import Coqpit
from torch import nn
from trainer import TrainerModel

# pylint: skip-file


class BaseTrainerModel(ABC, nn.Module):
    """Abstract 🐸TTS class. Every new 🐸TTS model must inherit this."""
class BaseTrainerModel(TrainerModel):
    """BaseTrainerModel model expanding TrainerModel with required functions by 🐸TTS.

    Every new 🐸TTS model must inherit it.
    """

    @staticmethod
    @abstractmethod
    def init_from_config(config: Coqpit):
        """Init the model from given config.
        """Init the model and all its attributes from the given config.

        Override this depending on your model.
        """
        ...

    @abstractmethod
    def forward(self, input: torch.Tensor, *args, aux_input={}, **kwargs) -> Dict:
        """Forward ... for the model mainly used in training.

        You can be flexible here and use different number of arguments and argument names since it is intended to be
        used by `train_step()` without exposing it out of the model.

        Args:
            input (torch.Tensor): Input tensor.
            aux_input (Dict): Auxiliary model inputs like embeddings, durations or any other sorts of inputs.

        Returns:
            Dict: Model outputs. Main model output must be named as "model_outputs".
        """
        outputs_dict = {"model_outputs": None}
        ...
        return outputs_dict

    @abstractmethod
    def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
        """Forward ... for inference.
        """Forward pass for inference.

        It must return a dictionary with the main model output and all the auxiliary outputs. The key ```model_outputs```
        is considered to be the main output and you can add any other auxiliary outputs as you want.

        We don't use `*kwargs` since it is problematic with the TorchScript API.
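For orientation, a structural sketch of what a subclass of the refactored `BaseTrainerModel` looks like, implementing the abstract methods whose signatures appear in this file's hunks. The layer sizes and names are invented, and the remaining Trainer hooks (optimizers, schedulers, data loaders) are omitted, so this is not a complete trainable model:

```python
# Hypothetical minimal subclass of the BaseTrainerModel interface above.
from typing import Dict, Tuple

import torch
from coqpit import Coqpit
from torch import nn

from TTS.model import BaseTrainerModel


class ToyTTSModel(BaseTrainerModel):
    def __init__(self, in_dim: int = 80, out_dim: int = 80):
        super().__init__()
        self.net = nn.Linear(in_dim, out_dim)

    @staticmethod
    def init_from_config(config: Coqpit):
        # a real model reads its dimensions, losses, etc. from `config`
        return ToyTTSModel()

    def forward(self, input: torch.Tensor, *args, aux_input={}, **kwargs) -> Dict:
        return {"model_outputs": self.net(input)}

    def inference(self, input: torch.Tensor, aux_input={}) -> Dict:
        return {"model_outputs": self.net(input)}

    def train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
        outputs = self.forward(batch["inputs"])
        loss = criterion(outputs["model_outputs"], batch["targets"])
        return outputs, {"loss": loss}

    def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
        return self.train_step(batch, criterion)

    def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
        state = torch.load(checkpoint_path, map_location="cpu")
        self.load_state_dict(state["model"], strict=strict)
```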
@ -55,78 +43,9 @@ class BaseTrainerModel(ABC, nn.Module):
|
|||
...
|
||||
return outputs_dict
|
||||
|
||||
def format_batch(self, batch: Dict) -> Dict:
|
||||
"""Format batch returned by the data loader before sending it to the model.
|
||||
|
||||
If not implemented, model uses the batch as is.
|
||||
Can be used for data augmentation, feature ectraction, etc.
|
||||
"""
|
||||
return batch
|
||||
|
||||
def format_batch_on_device(self, batch: Dict) -> Dict:
|
||||
"""Format batch on device before sending it to the model.
|
||||
|
||||
If not implemented, model uses the batch as is.
|
||||
Can be used for data augmentation, feature ectraction, etc.
|
||||
"""
|
||||
return batch
|
||||
|
||||
@abstractmethod
|
||||
def train_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
|
||||
"""Perform a single training step. Run the model forward ... and compute losses.
|
||||
|
||||
Args:
|
||||
batch (Dict): Input tensors.
|
||||
criterion (nn.Module): Loss layer designed for the model.
|
||||
|
||||
Returns:
|
||||
Tuple[Dict, Dict]: Model ouputs and computed losses.
|
||||
"""
|
||||
outputs_dict = {}
|
||||
loss_dict = {} # this returns from the criterion
|
||||
...
|
||||
return outputs_dict, loss_dict
|
||||
|
||||
def train_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int) -> None:
|
||||
"""Create visualizations and waveform examples for training.
|
||||
|
||||
For example, here you can plot spectrograms and generate sample sample waveforms from these spectrograms to
|
||||
be projected onto Tensorboard.
|
||||
|
||||
Args:
|
||||
ap (AudioProcessor): audio processor used at training.
|
||||
batch (Dict): Model inputs used at the previous training step.
|
||||
outputs (Dict): Model outputs generated at the previoud training step.
|
||||
|
||||
Returns:
|
||||
Tuple[Dict, np.ndarray]: training plots and output waveform.
|
||||
"""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def eval_step(self, batch: Dict, criterion: nn.Module) -> Tuple[Dict, Dict]:
|
||||
"""Perform a single evaluation step. Run the model forward ... and compute losses. In most cases, you can
|
||||
call `train_step()` with no changes.
|
||||
|
||||
Args:
|
||||
batch (Dict): Input tensors.
|
||||
criterion (nn.Module): Loss layer designed for the model.
|
||||
|
||||
Returns:
|
||||
Tuple[Dict, Dict]: Model ouputs and computed losses.
|
||||
"""
|
||||
outputs_dict = {}
|
||||
loss_dict = {} # this returns from the criterion
|
||||
...
|
||||
return outputs_dict, loss_dict
|
||||
|
||||
def eval_log(self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int) -> None:
|
||||
"""The same as `train_log()`"""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def load_checkpoint(self, config: Coqpit, checkpoint_path: str, eval: bool = False, strict: bool = True) -> None:
|
||||
"""Load a checkpoint and get ready for training or inference.
|
||||
"""Load a model checkpoint gile and get ready for training or inference.
|
||||
|
||||
Args:
|
||||
config (Coqpit): Model configuration.
|
||||
|
@ -135,36 +54,3 @@ class BaseTrainerModel(ABC, nn.Module):
|
|||
strcit (bool, optional): Match all checkpoint keys to model's keys. Defaults to True.
|
||||
"""
|
||||
...
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def init_from_config(config: Coqpit, samples: List[Dict] = None, verbose=False) -> "BaseTrainerModel":
|
||||
"""Init the model from given config.
|
||||
|
||||
Override this depending on your model.
|
||||
"""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def get_data_loader(
|
||||
self, config: Coqpit, assets: Dict, is_eval: True, data_items: List, verbose: bool, num_gpus: int
|
||||
):
|
||||
...
|
||||
|
||||
# def get_optimizer(self) -> Union["Optimizer", List["Optimizer"]]:
|
||||
# """Setup an return optimizer or optimizers."""
|
||||
# ...
|
||||
|
||||
# def get_lr(self) -> Union[float, List[float]]:
|
||||
# """Return learning rate(s).
|
||||
|
||||
# Returns:
|
||||
# Union[float, List[float]]: Model's initial learning rates.
|
||||
# """
|
||||
# ...
|
||||
|
||||
# def get_scheduler(self, optimizer: torch.optim.Optimizer):
|
||||
# ...
|
||||
|
||||
# def get_criterion(self):
|
||||
# ...
|
||||
|
|
|
@@ -21,4 +21,4 @@ Run the server with the official models on a GPU.
```CUDA_VISIBLE_DEVICES="0" python TTS/server/server.py --model_name tts_models/en/ljspeech/tacotron2-DCA --vocoder_name vocoder_models/en/ljspeech/multiband-melgan --use_cuda True```

Run the server with custom models.
```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth.tar --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth.tar --vocoder_config /path/to/vocoder/config.json```
```python TTS/server/server.py --tts_checkpoint /path/to/tts/model.pth --tts_config /path/to/tts/config.json --vocoder_checkpoint /path/to/vocoder/model.pth --vocoder_config /path/to/vocoder/config.json```
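Once running, the demo server also exposes a small HTTP API next to its web page. A hedged sketch of querying it from Python; the default port (5002) and the `/api/tts` route are assumptions about the server's defaults and should be checked against `TTS/server/server.py`:

```python
# Hypothetical client for the demo server started above.
# Assumes the default port 5002 and an /api/tts endpoint returning a WAV payload.
import requests

resp = requests.get(
    "http://localhost:5002/api/tts",
    params={"text": "Hello from the Coqui TTS server."},
    timeout=60,
)
resp.raise_for_status()
with open("server_output.wav", "wb") as f:
    f.write(resp.content)
```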
@ -1,6 +1,6 @@
|
|||
{
|
||||
"tts_path":"/media/erogol/data_ssd/Models/libri_tts/5049/", // tts model root folder
|
||||
"tts_file":"best_model.pth.tar", // tts checkpoint file
|
||||
"tts_file":"best_model.pth", // tts checkpoint file
|
||||
"tts_config":"config.json", // tts config.json file
|
||||
"tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
|
||||
"vocoder_config":null,
|
||||
|
|
|
@ -246,7 +246,7 @@ def libri_tts(root_path, meta_files=None, ignored_speakers=None):
|
|||
continue
|
||||
items.append({"text": text, "audio_file": wav_file, "speaker_name": f"LTTS_{speaker_name}"})
|
||||
for item in items:
|
||||
assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}"
|
||||
assert os.path.exists(item["audio_file"]), f" [!] wav files don't exist - {item['audio_file']}"
|
||||
return items
|
||||
|
||||
|
||||
|
|
|
@ -7,15 +7,15 @@ import torch.distributed as dist
|
|||
from coqpit import Coqpit
|
||||
from torch import nn
|
||||
from torch.utils.data import DataLoader
|
||||
from torch.utils.data.sampler import WeightedRandomSampler
|
||||
from trainer.torch import DistributedSampler, DistributedSamplerWrapper
|
||||
|
||||
from TTS.model import BaseTrainerModel
|
||||
from TTS.tts.datasets.dataset import TTSDataset
|
||||
from TTS.tts.utils.languages import LanguageManager, get_language_balancer_weights
|
||||
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_manager, get_speaker_balancer_weights
|
||||
from TTS.tts.utils.speakers import SpeakerManager, get_speaker_balancer_weights, get_speaker_manager
|
||||
from TTS.tts.utils.synthesis import synthesis
|
||||
from TTS.tts.utils.visual import plot_alignment, plot_spectrogram
|
||||
from torch.utils.data.sampler import WeightedRandomSampler
|
||||
|
||||
# pylint: skip-file
|
||||
|
||||
|
@ -258,7 +258,7 @@ class BaseTTS(BaseTrainerModel):
|
|||
# sampler for DDP
|
||||
if sampler is None:
|
||||
sampler = DistributedSampler(dataset) if num_gpus > 1 else None
|
||||
else: # If a sampler is already defined use this sampler and DDP sampler together
|
||||
else: # If a sampler is already defined use this sampler and DDP sampler together
|
||||
sampler = DistributedSamplerWrapper(sampler) if num_gpus > 1 else sampler
|
||||
|
||||
return sampler
|
||||
|
@ -279,9 +279,7 @@ class BaseTTS(BaseTrainerModel):
|
|||
# setup multi-speaker attributes
|
||||
if hasattr(self, "speaker_manager") and self.speaker_manager is not None:
|
||||
if hasattr(config, "model_args"):
|
||||
speaker_id_mapping = (
|
||||
self.speaker_manager.ids if config.model_args.use_speaker_embedding else None
|
||||
)
|
||||
speaker_id_mapping = self.speaker_manager.ids if config.model_args.use_speaker_embedding else None
|
||||
d_vector_mapping = self.speaker_manager.embeddings if config.model_args.use_d_vector_file else None
|
||||
config.use_d_vector_file = config.model_args.use_d_vector_file
|
||||
else:
|
||||
|
@ -293,9 +291,7 @@ class BaseTTS(BaseTrainerModel):
|
|||
|
||||
# setup multi-lingual attributes
|
||||
if hasattr(self, "language_manager") and self.language_manager is not None:
|
||||
language_id_mapping = (
|
||||
self.language_manager.ids if self.args.use_language_embedding else None
|
||||
)
|
||||
language_id_mapping = self.language_manager.ids if self.args.use_language_embedding else None
|
||||
else:
|
||||
language_id_mapping = None
|
||||
|
||||
|
|
|
@ -676,6 +676,7 @@ class Vits(BaseTTS):
|
|||
raise RuntimeError(
|
||||
" [!] To use the speaker consistency loss (SCL) you need to specify encoder_model_path and encoder_config_path !!"
|
||||
)
|
||||
|
||||
# load encoder
|
||||
self.speaker_manager.init_encoder(self.args.encoder_model_path, self.args.encoder_config_path)
|
||||
self.speaker_manager.encoder.eval()
|
||||
|
@@ -1095,7 +1096,9 @@
        return outputs

    @torch.no_grad()
    def inference_voice_conversion(self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None):
    def inference_voice_conversion(
        self, reference_wav, speaker_id=None, d_vector=None, reference_speaker_id=None, reference_d_vector=None
    ):
        """Inference for voice conversion

        Args:
@@ -1106,7 +1109,13 @@
            reference_d_vector (Tensor): d_vector embedding of the reference_wav speaker. Tensor of shape `[B, C]`
        """
        # compute spectrograms
        y = wav_to_spec(reference_wav, self.config.audio.fft_size, self.config.audio.hop_length, self.config.audio.win_length, center=False).transpose(1, 2)
        y = wav_to_spec(
            reference_wav,
            self.config.audio.fft_size,
            self.config.audio.hop_length,
            self.config.audio.win_length,
            center=False,
        ).transpose(1, 2)
        y_lengths = torch.tensor([y.size(-1)]).to(y.device)
        speaker_cond_src = reference_speaker_id if reference_speaker_id is not None else reference_d_vector
        speaker_cond_tgt = speaker_id if speaker_id is not None else d_vector
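As the signature above shows, `inference_voice_conversion` takes a reference waveform plus speaker IDs or d-vectors for the source and target voices. A hedged sketch of driving it through the `Synthesizer` wrapper, the way `TTS/bin/synthesize.py` does elsewhere in this diff; the speaker keyword names are assumptions, while `reference_wav`/`reference_speaker_name` mirror the synthesize.py call shown earlier:

```python
# Hedged sketch of the voice-conversion path above via the Synthesizer wrapper.
# Paths and speaker names are placeholders; keyword names may differ between versions.
from TTS.utils.synthesizer import Synthesizer

synthesizer = Synthesizer(
    tts_checkpoint="path/to/vits_model.pth",
    tts_config_path="path/to/config.json",
    use_cuda=False,
)
wav = synthesizer.tts(
    text=None,                                  # no text: convert the reference speech instead
    speaker_name="target_speaker",
    reference_wav="path/to/source_speech.wav",
    reference_speaker_name="source_speaker",
)
synthesizer.save_wav(wav, "converted.wav")
```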
@ -1346,6 +1355,7 @@ class Vits(BaseTTS):
|
|||
else:
|
||||
emotion_id = self.emotion_manager.ids[emotion_name]
|
||||
|
||||
|
||||
return {
|
||||
"text": text,
|
||||
"speaker_id": speaker_id,
|
||||
|
@ -1419,12 +1429,8 @@ class Vits(BaseTTS):
|
|||
d_vectors = torch.FloatTensor(d_vectors)
|
||||
|
||||
# get language ids from language names
|
||||
if (
|
||||
self.language_manager is not None
|
||||
and self.language_manager.ids
|
||||
and self.args.use_language_embedding
|
||||
):
|
||||
language_ids = [self.language_manager.ids[ln] for ln in batch["f"]]
|
||||
if self.language_manager is not None and self.language_manager.ids and self.args.use_language_embedding:
|
||||
language_ids = [self.language_manager.ids[ln] for ln in batch["language_names"]]
|
||||
|
||||
if language_ids is not None:
|
||||
language_ids = torch.LongTensor(language_ids)
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
import os
|
||||
from typing import Dict, List, Any
|
||||
from typing import Any, Dict, List
|
||||
|
||||
|
||||
import fsspec
|
||||
import numpy as np
|
||||
|
@ -9,6 +10,7 @@ from coqpit import Coqpit
|
|||
from TTS.config import check_config_and_model_args
|
||||
from TTS.tts.utils.managers import BaseIDManager
|
||||
|
||||
|
||||
class LanguageManager(BaseIDManager):
|
||||
"""Manage the languages for multi-lingual 🐸TTS models. Load a datafile and parse the information
|
||||
in a way that can be queried by language.
|
||||
|
|
|
@ -12,13 +12,11 @@ from TTS.utils.audio import AudioProcessor
|
|||
|
||||
|
||||
class BaseIDManager:
|
||||
""" Base `ID` Manager class. Every new `ID` manager must inherit this.
|
||||
"""Base `ID` Manager class. Every new `ID` manager must inherit this.
|
||||
It defines common `ID` manager specific functions.
|
||||
"""
|
||||
def __init__(
|
||||
self,
|
||||
id_file_path: str = ""
|
||||
):
|
||||
|
||||
def __init__(self, id_file_path: str = ""):
|
||||
self.ids = {}
|
||||
|
||||
if id_file_path:
|
||||
|
@ -85,10 +83,12 @@ class BaseIDManager:
|
|||
ids = {name: i for i, name in enumerate(classes)}
|
||||
return ids
|
||||
|
||||
|
||||
class EmbeddingManager(BaseIDManager):
|
||||
""" Base `Embedding` Manager class. Every new `Embedding` manager must inherit this.
|
||||
"""Base `Embedding` Manager class. Every new `Embedding` manager must inherit this.
|
||||
It defines common `Embedding` manager specific functions.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
embedding_file_path: str = "",
|
||||
|
@ -225,7 +225,10 @@ class EmbeddingManager(BaseIDManager):
|
|||
"""
|
||||
self.encoder_config = load_config(config_path)
|
||||
self.encoder = setup_encoder_model(self.encoder_config)
|
||||
self.encoder_criterion = self.encoder.load_checkpoint(self.encoder_config, model_path, eval=True, use_cuda=self.use_cuda)
|
||||
|
||||
self.encoder_criterion = self.encoder.load_checkpoint(
|
||||
self.encoder_config, model_path, eval=True, use_cuda=self.use_cuda
|
||||
)
|
||||
self.encoder_ap = AudioProcessor(**self.encoder_config.audio)
|
||||
|
||||
def compute_embedding_from_clip(self, wav_file: Union[str, List[str]]) -> list:
|
||||
|
|
|
@ -10,6 +10,7 @@ from coqpit import Coqpit
|
|||
from TTS.config import get_from_config_or_model_args_with_default
|
||||
from TTS.tts.utils.managers import EmbeddingManager
|
||||
|
||||
|
||||
class SpeakerManager(EmbeddingManager):
|
||||
"""Manage the speakers for multi-speaker 🐸TTS models. Load a datafile and parse the information
|
||||
in a way that can be queried by speaker or clip.
|
||||
|
@ -67,6 +68,7 @@ class SpeakerManager(EmbeddingManager):
|
|||
use_cuda=use_cuda
|
||||
)
|
||||
|
||||
|
||||
if data_items:
|
||||
self.set_ids_from_data(data_items, parse_key="speaker_name")
|
||||
|
||||
|
|
|
@ -218,6 +218,7 @@ def synthesis(
|
|||
}
|
||||
return return_dict
|
||||
|
||||
|
||||
def transfer_voice(
|
||||
model,
|
||||
CONFIG,
|
||||
|
@ -281,12 +282,7 @@ def transfer_voice(
|
|||
_func = model.module.inference_voice_conversion
|
||||
else:
|
||||
_func = model.inference_voice_conversion
|
||||
model_outputs = _func(
|
||||
reference_wav,
|
||||
speaker_id,
|
||||
d_vector,
|
||||
reference_speaker_id,
|
||||
reference_d_vector)
|
||||
model_outputs = _func(reference_wav, speaker_id, d_vector, reference_speaker_id, reference_d_vector)
|
||||
|
||||
# convert outputs to numpy
|
||||
# plot results
|
||||
|
|
|
@@ -12,16 +12,9 @@ GRUUT_LANGS = list(Gruut.supported_languages())


# Dict setting default phonemizers for each language
DEF_LANG_TO_PHONEMIZER = {
    "ja-jp": JA_JP_Phonemizer.name(),
    "zh-cn": ZH_CN_Phonemizer.name(),
}


# Add Gruut languages
_ = [Gruut.name()] * len(GRUUT_LANGS)
_new_dict = dict(list(zip(GRUUT_LANGS, _)))
DEF_LANG_TO_PHONEMIZER.update(_new_dict)
DEF_LANG_TO_PHONEMIZER = dict(list(zip(GRUUT_LANGS, _)))


# Add ESpeak languages and override any existing ones
@@ -29,7 +22,10 @@ _ = [ESpeak.name()] * len(ESPEAK_LANGS)
_new_dict = dict(list(zip(list(ESPEAK_LANGS), _)))
DEF_LANG_TO_PHONEMIZER.update(_new_dict)

# Force default for some languages
DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"]
DEF_LANG_TO_PHONEMIZER["ja-jp"] = JA_JP_Phonemizer.name()
DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name()


def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer:
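This hunk builds the language-to-phonemizer map in layers: start from Gruut's supported languages, overlay the eSpeak entries, then force a few explicit defaults. A self-contained sketch of that layering with invented language lists and backend names:

```python
# Standalone illustration of the layered default map built above
# (language lists and backend names are made up for the example).
gruut_langs = ["en-us", "de", "fr-fr"]
espeak_langs = ["en-us", "tr"]

lang_to_phonemizer = dict(zip(gruut_langs, ["gruut"] * len(gruut_langs)))
lang_to_phonemizer.update(dict(zip(espeak_langs, ["espeak"] * len(espeak_langs))))

# force defaults for a few languages
lang_to_phonemizer["en"] = lang_to_phonemizer["en-us"]
lang_to_phonemizer["ja-jp"] = "ja_jp_phonemizer"

print(lang_to_phonemizer)
# {'en-us': 'espeak', 'de': 'gruut', 'fr-fr': 'gruut', 'tr': 'espeak', 'en': 'espeak', 'ja-jp': 'ja_jp_phonemizer'}
```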
@ -371,7 +371,9 @@ class AudioProcessor(object):
|
|||
self.hop_length = hop_length
|
||||
self.win_length = win_length
|
||||
assert min_level_db != 0.0, " [!] min_level_db is 0"
|
||||
assert self.win_length <= self.fft_size, " [!] win_length cannot be larger than fft_size"
|
||||
assert (
|
||||
self.win_length <= self.fft_size
|
||||
), f" [!] win_length cannot be larger than fft_size - {self.win_length} vs {self.fft_size}"
|
||||
members = vars(self)
|
||||
if verbose:
|
||||
print(" > Setting up Audio Processor...")
|
||||
|
|
|
@ -67,7 +67,7 @@ def get_experiment_folder_path(root_path, model_name):
|
|||
def remove_experiment_folder(experiment_path):
|
||||
"""Check folder if there is a checkpoint, otherwise remove the folder"""
|
||||
fs = fsspec.get_mapper(experiment_path).fs
|
||||
checkpoint_files = fs.glob(experiment_path + "/*.pth.tar")
|
||||
checkpoint_files = fs.glob(experiment_path + "/*.pth")
|
||||
if not checkpoint_files:
|
||||
if fs.exists(experiment_path):
|
||||
fs.rm(experiment_path, recursive=True)
|
||||
|
|
|
@ -140,7 +140,7 @@ def save_checkpoint(
|
|||
output_folder,
|
||||
**kwargs,
|
||||
):
|
||||
file_name = "checkpoint_{}.pth.tar".format(current_step)
|
||||
file_name = "checkpoint_{}.pth".format(current_step)
|
||||
checkpoint_path = os.path.join(output_folder, file_name)
|
||||
print("\n > CHECKPOINT : {}".format(checkpoint_path))
|
||||
save_model(
|
||||
|
@ -170,7 +170,7 @@ def save_best_model(
|
|||
**kwargs,
|
||||
):
|
||||
if current_loss < best_loss:
|
||||
best_model_name = f"best_model_{current_step}.pth.tar"
|
||||
best_model_name = f"best_model_{current_step}.pth"
|
||||
checkpoint_path = os.path.join(out_path, best_model_name)
|
||||
print(" > BEST MODEL : {}".format(checkpoint_path))
|
||||
save_model(
|
||||
|
@ -187,12 +187,12 @@ def save_best_model(
|
|||
fs = fsspec.get_mapper(out_path).fs
|
||||
# only delete previous if current is saved successfully
|
||||
if not keep_all_best or (current_step < keep_after):
|
||||
model_names = fs.glob(os.path.join(out_path, "best_model*.pth.tar"))
|
||||
model_names = fs.glob(os.path.join(out_path, "best_model*.pth"))
|
||||
for model_name in model_names:
|
||||
if os.path.basename(model_name) != best_model_name:
|
||||
fs.rm(model_name)
|
||||
# create a shortcut which always points to the currently best model
|
||||
shortcut_name = "best_model.pth.tar"
|
||||
shortcut_name = "best_model.pth"
|
||||
shortcut_path = os.path.join(out_path, shortcut_name)
|
||||
fs.copy(checkpoint_path, shortcut_path)
|
||||
best_loss = current_loss
|
||||
|
|
|
@ -4,6 +4,7 @@ import os
|
|||
import zipfile
|
||||
from pathlib import Path
|
||||
from shutil import copyfile, rmtree
|
||||
from typing import Tuple
|
||||
|
||||
import requests
|
||||
|
||||
|
@ -114,7 +115,7 @@ class ModelManager(object):
|
|||
e.g. 'tts_model/en/ljspeech/tacotron'
|
||||
|
||||
Every model must have the following files:
|
||||
- *.pth.tar : pytorch model checkpoint file.
|
||||
- *.pth : pytorch model checkpoint file.
|
||||
- config.json : model config file.
|
||||
- scale_stats.npy (if exist): scale values for preprocessing.
|
||||
|
||||
|
@ -127,9 +128,6 @@ class ModelManager(object):
|
|||
model_item = self.models_dict[model_type][lang][dataset][model]
|
||||
# set the model specific output path
|
||||
output_path = os.path.join(self.output_prefix, model_full_name)
|
||||
output_model_path = os.path.join(output_path, "model_file.pth.tar")
|
||||
output_config_path = os.path.join(output_path, "config.json")
|
||||
|
||||
if os.path.exists(output_path):
|
||||
print(f" > {model_name} is already downloaded.")
|
||||
else:
|
||||
|
@ -137,10 +135,51 @@ class ModelManager(object):
|
|||
print(f" > Downloading model to {output_path}")
|
||||
# download from github release
|
||||
self._download_zip_file(model_item["github_rls_url"], output_path)
|
||||
# find downloaded files
|
||||
output_model_path, output_config_path = self._find_files(output_path)
|
||||
# update paths in the config.json
|
||||
self._update_paths(output_path, output_config_path)
|
||||
return output_model_path, output_config_path, model_item
|
||||
|
||||
@staticmethod
|
||||
def _find_files(output_path: str) -> Tuple[str, str]:
|
||||
"""Find the model and config files in the output path
|
||||
|
||||
Args:
|
||||
output_path (str): path to the model files
|
||||
|
||||
Returns:
|
||||
Tuple[str, str]: path to the model file and config file
|
||||
"""
|
||||
model_file = None
|
||||
config_file = None
|
||||
for file_name in os.listdir(output_path):
|
||||
if file_name in ["model_file.pth", "model_file.pth.tar", "model.pth"]:
|
||||
model_file = os.path.join(output_path, file_name)
|
||||
elif file_name == "config.json":
|
||||
config_file = os.path.join(output_path, file_name)
|
||||
if model_file is None:
|
||||
raise ValueError(" [!] Model file not found in the output path")
|
||||
if config_file is None:
|
||||
raise ValueError(" [!] Config file not found in the output path")
|
||||
return model_file, config_file
|
||||
|
||||
@staticmethod
|
||||
def _find_speaker_encoder(output_path: str) -> str:
|
||||
"""Find the speaker encoder file in the output path
|
||||
|
||||
Args:
|
||||
output_path (str): path to the model files
|
||||
|
||||
Returns:
|
||||
str: path to the speaker encoder file
|
||||
"""
|
||||
speaker_encoder_file = None
|
||||
for file_name in os.listdir(output_path):
|
||||
if file_name in ["model_se.pth", "model_se.pth.tar"]:
|
||||
speaker_encoder_file = os.path.join(output_path, file_name)
|
||||
return speaker_encoder_file
|
||||
|
||||
def _update_paths(self, output_path: str, config_path: str) -> None:
|
||||
"""Update paths for certain files in config.json after download.
|
||||
|
||||
|
@@ -174,7 +213,7 @@
    @staticmethod
    def _update_path(field_name, new_path, config_path):
        """Update the path in the model config.json for the current environment after download"""
        if os.path.exists(new_path):
        if new_path and os.path.exists(new_path):
            config = load_config(config_path)
            field_names = field_name.split(".")
            if len(field_names) > 1:
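The `ModelManager` hunks above change how a downloaded model's files are located (any of `model_file.pth`, `model_file.pth.tar`, or `model.pth`) before their paths are patched into `config.json`. A hedged sketch of the download flow from Python; the model name is only an example entry, and the constructor argument follows the `.models.json` lookup used by `TTS/bin/synthesize.py`:

```python
# Hedged sketch: resolving a released model through ModelManager, which
# internally calls _find_files() / _update_paths() as in the hunks above.
from pathlib import Path

import TTS
from TTS.utils.manage import ModelManager

manager = ModelManager(Path(TTS.__file__).parent / ".models.json")
model_path, config_path, model_item = manager.download_model("tts_models/en/ljspeech/glow-tts")
print(model_path)   # e.g. .../model_file.pth
print(config_path)  # .../config.json
```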
@ -214,8 +214,8 @@ class Synthesizer(object):
|
|||
|
||||
if not text and not reference_wav:
|
||||
raise ValueError(
|
||||
"You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API."
|
||||
)
|
||||
"You need to define either `text` (for sythesis) or a `reference_wav` (for voice conversion) to use the Coqui TTS API."
|
||||
)
|
||||
|
||||
if text:
|
||||
sens = self.split_into_sentences(text)
|
||||
|
@ -228,8 +228,10 @@ class Synthesizer(object):
|
|||
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "ids"):
|
||||
if speaker_name and isinstance(speaker_name, str):
|
||||
if self.tts_config.use_d_vector_file:
|
||||
# get the average speaker embedding from the saved embeddings.
|
||||
speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(speaker_name, num_samples=None, randomize=False)
|
||||
# get the average speaker embedding from the saved d_vectors.
|
||||
speaker_embedding = self.tts_model.speaker_manager.get_mean_embedding(
|
||||
speaker_name, num_samples=None, randomize=False
|
||||
)
|
||||
speaker_embedding = np.array(speaker_embedding)[None, :] # [1 x embedding_dim]
|
||||
else:
|
||||
# get speaker idx from the speaker name
|
||||
|
@ -354,26 +356,32 @@ class Synthesizer(object):
|
|||
if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "speaker_ids"):
|
||||
if reference_speaker_name and isinstance(reference_speaker_name, str):
|
||||
if self.tts_config.use_d_vector_file:
|
||||
# get the speaker embedding from the saved embeddings.
|
||||
reference_speaker_embedding = self.tts_model.speaker_manager.get_embeddings_by_name(reference_speaker_name)[0]
|
||||
reference_speaker_embedding = np.array(reference_speaker_embedding)[None, :] # [1 x embedding_dim]
|
||||
# get the speaker embedding from the saved d_vectors.
|
||||
reference_speaker_embedding = self.tts_model.speaker_manager.get_embeddings_by_name(
|
||||
reference_speaker_name
|
||||
)[0]
|
||||
reference_speaker_embedding = np.array(reference_speaker_embedding)[
|
||||
None, :
|
||||
] # [1 x embedding_dim]
|
||||
else:
|
||||
# get speaker idx from the speaker name
|
||||
reference_speaker_id = self.tts_model.speaker_manager.ids[reference_speaker_name]
|
||||
else:
|
||||
reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(reference_wav)
|
||||
reference_speaker_embedding = self.tts_model.speaker_manager.compute_embedding_from_clip(
|
||||
reference_wav
|
||||
)
|
||||
|
||||
outputs = transfer_voice(
|
||||
model=self.tts_model,
|
||||
CONFIG=self.tts_config,
|
||||
use_cuda=self.use_cuda,
|
||||
reference_wav=reference_wav,
|
||||
speaker_id=speaker_id,
|
||||
d_vector=speaker_embedding,
|
||||
use_griffin_lim=use_gl,
|
||||
reference_speaker_id=reference_speaker_id,
|
||||
reference_d_vector=reference_speaker_embedding
|
||||
)
|
||||
model=self.tts_model,
|
||||
CONFIG=self.tts_config,
|
||||
use_cuda=self.use_cuda,
|
||||
reference_wav=reference_wav,
|
||||
speaker_id=speaker_id,
|
||||
d_vector=speaker_embedding,
|
||||
use_griffin_lim=use_gl,
|
||||
reference_speaker_id=reference_speaker_id,
|
||||
reference_d_vector=reference_speaker_embedding,
|
||||
)
|
||||
waveform = outputs
|
||||
if not use_gl:
|
||||
mel_postnet_spec = outputs[0].detach().cpu().numpy()
|
||||
|
|
|
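To summarize the reference-embedding branch that the reformatted hunk above touches: the stored d-vector is used when a reference speaker name is given, otherwise one is extracted from the reference clip. A hedged, helper-style sketch (the function name is made up; the two `SpeakerManager` calls are the ones visible in the diff):

```python
# Hypothetical helper mirroring the branch above; not part of the TTS API.
import numpy as np


def pick_reference_d_vector(speaker_manager, reference_speaker_name=None, reference_wav=None):
    if reference_speaker_name:
        # first stored embedding for that speaker, shaped [1 x embedding_dim]
        emb = speaker_manager.get_embeddings_by_name(reference_speaker_name)[0]
        return np.array(emb)[None, :]
    # no name given: compute an embedding directly from the reference audio clip
    return speaker_manager.compute_embedding_from_clip(reference_wav)
```

Whatever this branch produces is what ends up as `reference_d_vector` in the `transfer_voice(...)` call.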
@ -29,7 +29,7 @@ You can continue a previous training run by the following command.
|
|||
|
||||
You can fine-tune a pre-trained model by the following command.
|
||||
|
||||
```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth.tar```
|
||||
```CUDA_VISIBLE_DEVICES='0' python tts/bin/train_vocoder.py --restore_path path/to/your/model.pth```
|
||||
|
||||
Restoring a model starts a new training run in a different folder; it only loads the model weights from the given checkpoint file. Continuing a training run, by contrast, resumes from the same directory where the previous run left off.
|
||||
|
||||
|
|
|
@ -93,13 +93,13 @@ them and fine-tune it for your own dataset. This will help you in two main ways:
|
|||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \
|
||||
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar
|
||||
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth
|
||||
```
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_tts.py \
|
||||
--config_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/config.json \
|
||||
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar
|
||||
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth
|
||||
```
|
||||
|
||||
As stated above, you can also use command-line arguments to change the model configuration.
|
||||
|
@ -107,7 +107,7 @@ them and fine-tune it for your own dataset. This will help you in two main ways:
|
|||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES="0" python recipes/ljspeech/glow_tts/train_glowtts.py \
|
||||
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth.tar
|
||||
--restore_path /home/ubuntu/.local/share/tts/tts_models--en--ljspeech--glow-tts/model_file.pth \
|
||||
--coqpit.run_name "glow-tts-finetune" \
|
||||
--coqpit.lr 0.00001
|
||||
```
|
||||
|
|
|
@ -44,7 +44,7 @@ Run your own TTS model (Using Griffin-Lim Vocoder)
|
|||
|
||||
```bash
|
||||
tts --text "Text for TTS" \
|
||||
--model_path path/to/model.pth.tar \
|
||||
--model_path path/to/model.pth \
|
||||
--config_path path/to/config.json \
|
||||
--out_path folder/to/save/output.wav
|
||||
```
|
||||
|
@ -54,9 +54,9 @@ Run your own TTS and Vocoder models
|
|||
```bash
|
||||
tts --text "Text for TTS" \
|
||||
--config_path path/to/config.json \
|
||||
--model_path path/to/model.pth.tar \
|
||||
--model_path path/to/model.pth \
|
||||
--out_path folder/to/save/output.wav \
|
||||
--vocoder_path path/to/vocoder.pth.tar \
|
||||
--vocoder_path path/to/vocoder.pth \
|
||||
--vocoder_config_path path/to/vocoder_config.json
|
||||
```
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@
|
|||
If you like to run a multi-gpu training using DDP back-end,
|
||||
|
||||
```bash
|
||||
$ CUDA_VISIBLE_DEVICES="0, 1, 2" python TTS/bin/distribute.py --script <path_to_your_script>/train_glowtts.py
|
||||
$ CUDA_VISIBLE_DEVICES="0, 1, 2" python -m trainer.distribute --script <path_to_your_script>/train_glowtts.py
|
||||
```
|
||||
|
||||
The example above runs a multi-gpu training using GPUs `0, 1, 2`.
|
||||
|
@ -122,7 +122,7 @@
|
|||
|
||||
```bash
|
||||
$ tts --text "Text for TTS" \
|
||||
--model_path path/to/checkpoint_x.pth.tar \
|
||||
--model_path path/to/checkpoint_x.pth \
|
||||
--config_path path/to/config.json \
|
||||
--out_path folder/to/save/output.wav
|
||||
```
|
||||
|
|
|
@ -50,13 +50,13 @@ A breakdown of a simple script that trains a GlowTTS model on the LJspeech datas
|
|||
- Fine-tune a model.
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0 python train.py --restore_path path/to/model/checkpoint.pth.tar
|
||||
CUDA_VISIBLE_DEVICES=0 python train.py --restore_path path/to/model/checkpoint.pth
|
||||
```
|
||||
|
||||
- Run multi-gpu training.
|
||||
|
||||
```bash
|
||||
CUDA_VISIBLE_DEVICES=0,1,2 python TTS/bin/distribute.py --script train.py
|
||||
CUDA_VISIBLE_DEVICES=0,1,2 python -m trainer.distribute --script train.py
|
||||
```
|
||||
|
||||
### CLI Way
|
||||
|
|
|
@ -66,7 +66,7 @@
|
|||
"DATASET = \"ljspeech\"\n",
|
||||
"METADATA_FILE = \"metadata.csv\"\n",
|
||||
"CONFIG_PATH = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/config.json\"\n",
|
||||
"MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth.tar\"\n",
|
||||
"MODEL_FILE = \"/home/ubuntu/.local/share/tts/tts_models--en--ljspeech--tacotron2-DDC_ph/model_file.pth\"\n",
|
||||
"BATCH_SIZE = 32\n",
|
||||
"\n",
|
||||
"QUANTIZED_WAV = False\n",
|
||||
|
|
|
@ -66,7 +66,7 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"MODEL_RUN_PATH = \"/media/erogol/data_ssd/Models/libri_tts/speaker_encoder/libritts_360-half-October-31-2019_04+54PM-19d2f5f/\"\n",
|
||||
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth.tar\"\n",
|
||||
"MODEL_PATH = MODEL_RUN_PATH + \"best_model.pth\"\n",
|
||||
"CONFIG_PATH = MODEL_RUN_PATH + \"config.json\"\n",
|
||||
"\n",
|
||||
"# My single speaker locations\n",
|
||||
|
|
|
@ -73,7 +73,7 @@
|
|||
"\n",
|
||||
"# Set constants\n",
|
||||
"ROOT_PATH = '/home/erogol/Models/LJSpeech/ljspeech-May-20-2020_12+29PM-1835628/'\n",
|
||||
"MODEL_PATH = ROOT_PATH + '/best_model.pth.tar'\n",
|
||||
"MODEL_PATH = ROOT_PATH + '/best_model.pth'\n",
|
||||
"CONFIG_PATH = ROOT_PATH + '/config.json'\n",
|
||||
"OUT_FOLDER = './hard_sentences/'\n",
|
||||
"CONFIG = load_config(CONFIG_PATH)\n",
|
||||
|
|
|
@ -416,7 +416,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
"version": "3.9.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
|
|
@ -3,6 +3,10 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%matplotlib inline\n",
|
||||
"\n",
|
||||
|
@ -12,21 +16,51 @@
|
|||
"\n",
|
||||
"import IPython.display as ipd\n",
|
||||
"import glob"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"config_path = \"/home/erogol/gdrive/Projects/TTS/recipes/ljspeech/align_tts/config_transformer2.json\"\n",
|
||||
"data_path = \"/home/erogol/gdrive/Datasets/LJSpeech-1.1/\"\n",
|
||||
"\n",
|
||||
"file_paths = glob.glob(data_path + \"/**/*.wav\", recursive=True)\n",
|
||||
"CONFIG = load_config(config_path)\n",
|
||||
"from TTS.config.shared_configs import BaseAudioConfig\n",
|
||||
"CONFIG = BaseAudioConfig()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## ✍️ Set these values "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_path = \"/root/wav48_silence_trimmed/\"\n",
|
||||
"file_ext = \".flac\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Read audio files"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"file_paths = glob.glob(data_path + f\"/**/*{file_ext}\", recursive=True)\n",
|
||||
"\n",
|
||||
"# Change this to the index of the desired file listed below\n",
|
||||
"sample_file_index = 10\n",
|
||||
|
@ -35,44 +69,45 @@
|
|||
"\n",
|
||||
"print(\"File list, by index:\")\n",
|
||||
"dict(enumerate(file_paths))"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"source": [
|
||||
"### Setup Audio Processor\n",
|
||||
"## ✍️ Set Audio Processor\n",
|
||||
"Play with the AP parameters until you find a good fit with the synthesis speech below.\n",
|
||||
"\n",
|
||||
"The default values are loaded from your config.json file, so you only need to\n",
|
||||
"uncomment and modify values below that you'd like to tune."
|
||||
],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tune_params={\n",
|
||||
"# 'audio_processor': 'audio',\n",
|
||||
"# 'num_mels': 80, # In general, you don't need to change this. \n",
|
||||
"# 'fft_size': 1024, # In general, you don't need to change this.\n",
|
||||
"# 'sample_rate': 22050, # This must match the sample rate of the dataset.\n",
|
||||
"# 'hop_length': 256, # In general, you don't need to change this.\n",
|
||||
"# 'win_length': 1024, # In general, you don't need to change this.\n",
|
||||
"# 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
|
||||
"# 'min_level_db': -100,\n",
|
||||
"# 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n",
|
||||
"# 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n",
|
||||
"# 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n",
|
||||
"# 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
|
||||
"# 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
|
||||
"# 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n",
|
||||
" 'num_mels': 80, # In general, you don't need to change this. \n",
|
||||
" 'fft_size': 2400, # In general, you don't need to change this.\n",
|
||||
" 'frame_length_ms': 50, \n",
|
||||
" 'frame_shift_ms': 12.5,\n",
|
||||
" 'sample_rate': 48000, # This must match the sample rate of the dataset.\n",
|
||||
" 'hop_length': None, # In general, you don't need to change this.\n",
|
||||
" 'win_length': 1024, # In general, you don't need to change this.\n",
|
||||
" 'preemphasis': 0.98, # In general, 0 gives better voice recovery but makes training harder. If your model does not train, try 0.97 - 0.99.\n",
|
||||
" 'min_level_db': -100,\n",
|
||||
" 'ref_level_db': 0, # The base DB; increase until all background noise is removed in the spectrogram, then lower until you hear better speech below.\n",
|
||||
" 'power': 1.5, # Change this value and listen to the synthesized voice. 1.2 - 1.5 are resonable values.\n",
|
||||
" 'griffin_lim_iters': 60, # Quality does not improve for values > 60\n",
|
||||
" 'mel_fmin': 0.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
|
||||
" 'mel_fmax': 8000.0, # Adjust this and check mel-spectrogram-based voice synthesis below.\n",
|
||||
" 'do_trim_silence': True # If you dataset has some silience at the beginning or end, this trims it. Check the AP.load_wav() below,if it causes any difference for the loaded audio file.\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"# These options have to be forced off in order to avoid errors about the \n",
|
||||
|
@ -86,59 +121,57 @@
|
|||
"}\n",
|
||||
"\n",
|
||||
"# Override select parts of loaded config with parameters above\n",
|
||||
"tuned_config = CONFIG.audio.copy()\n",
|
||||
"tuned_config = CONFIG.copy()\n",
|
||||
"tuned_config.update(reset)\n",
|
||||
"tuned_config.update(tune_params)\n",
|
||||
"\n",
|
||||
"AP = AudioProcessor(**tuned_config);"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"### Check audio loading "
|
||||
],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"### Check audio loading "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"wav = AP.load_wav(SAMPLE_FILE_PATH)\n",
|
||||
"ipd.Audio(data=wav, rate=AP.sample_rate) "
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"### Generate Mel-Spectrogram and Re-synthesis with GL"
|
||||
],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"### Generate Mel-Spectrogram and Re-synthesis with GL"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"AP.power = 1.5"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mel = AP.melspectrogram(wav)\n",
|
||||
"print(\"Max:\", mel.max())\n",
|
||||
|
@ -148,24 +181,24 @@
|
|||
"\n",
|
||||
"wav_gen = AP.inv_melspectrogram(mel)\n",
|
||||
"ipd.Audio(wav_gen, rate=AP.sample_rate)"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"### Generate Linear-Spectrogram and Re-synthesis with GL"
|
||||
],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"### Generate Linear-Spectrogram and Re-synthesis with GL"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"spec = AP.spectrogram(wav)\n",
|
||||
"print(\"Max:\", spec.max())\n",
|
||||
|
@ -175,26 +208,26 @@
|
|||
"\n",
|
||||
"wav_gen = AP.inv_spectrogram(spec)\n",
|
||||
"ipd.Audio(wav_gen, rate=AP.sample_rate)"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"source": [
|
||||
"### Compare values for a certain parameter\n",
|
||||
"\n",
|
||||
"Optimize your parameters by comparing different values per parameter at a time."
|
||||
],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from librosa import display\n",
|
||||
"from matplotlib import pylab as plt\n",
|
||||
|
@ -234,39 +267,39 @@
|
|||
" val = values[idx]\n",
|
||||
" print(\" > {} = {}\".format(attribute, val))\n",
|
||||
" IPython.display.display(IPython.display.Audio(wav_gen, rate=AP.sample_rate))"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"compare_values(\"preemphasis\", [0, 0.5, 0.97, 0.98, 0.99])"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"source": [
|
||||
"compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])"
|
||||
],
|
||||
"outputs": [],
|
||||
"metadata": {
|
||||
"Collapsed": "false"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"compare_values(\"ref_level_db\", [2, 5, 10, 15, 20, 25, 30, 35, 40, 1000])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"interpreter": {
|
||||
"hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0"
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3.8.5 64-bit ('torch': conda)"
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
|
@ -278,12 +311,9 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
},
|
||||
"interpreter": {
|
||||
"hash": "27648abe09795c3a768a281b31f7524fcf66a207e733f8ecda3a4e1fd1059fb0"
|
||||
"version": "3.9.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
}
|
||||
|
|
|
@ -6,7 +6,7 @@ max-line-length=120
|
|||
|
||||
[tool.black]
|
||||
line-length = 120
|
||||
target-version = ['py38']
|
||||
target-version = ['py39']
|
||||
exclude = '''
|
||||
|
||||
(
|
||||
|
|
|
@ -49,7 +49,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init model
|
||||
model = AlignTTS(config, ap, tokenizer)
|
||||
|
|
|
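The comment block above mentions passing a custom formatter to `load_tts_samples`. Here is a hedged sketch of what such a formatter might look like for a simple `wav_path|transcript` metadata file; the exact sample keys and the `formatter` keyword can differ between TTS versions, so treat this as an illustration rather than a drop-in implementation.

```python
# Hypothetical formatter for a "wav_path|transcript" metadata file.
import os


def pipe_separated_formatter(root_path, meta_file, **kwargs):
    samples = []
    with open(os.path.join(root_path, meta_file), encoding="utf-8") as f:
        for line in f:
            wav_file, text = line.strip().split("|", maxsplit=1)
            samples.append(
                {
                    "text": text,
                    "audio_file": os.path.join(root_path, wav_file),
                    "speaker_name": "speaker0",  # single-speaker dataset assumed
                }
            )
    return samples


# then, assuming the keyword is supported in your version:
# train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, formatter=pipe_separated_formatter)
```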
@ -84,7 +84,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init the model
|
||||
model = ForwardTTS(config, ap, tokenizer, speaker_manager=None)
|
||||
|
|
|
@ -83,7 +83,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init the model
|
||||
model = ForwardTTS(config, ap, tokenizer)
|
||||
|
|
|
@ -60,7 +60,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# INITIALIZE THE MODEL
|
||||
# Models take a config object and a speaker manager as input
|
||||
|
|
|
@ -41,11 +41,6 @@ model = GAN(config, ap)
|
|||
|
||||
# init the trainer and 🚀
|
||||
trainer = Trainer(
|
||||
TrainerArgs(),
|
||||
config,
|
||||
output_path,
|
||||
model=model,
|
||||
train_samples=train_samples,
|
||||
eval_samples=eval_samples
|
||||
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
|
||||
)
|
||||
trainer.fit()
|
||||
|
|
|
@ -41,11 +41,6 @@ model = GAN(config, ap)
|
|||
|
||||
# init the trainer and 🚀
|
||||
trainer = Trainer(
|
||||
TrainerArgs(),
|
||||
config,
|
||||
output_path,
|
||||
model=model,
|
||||
train_samples=train_samples,
|
||||
eval_samples=eval_samples
|
||||
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
|
||||
)
|
||||
trainer.fit()
|
||||
|
|
|
@ -67,7 +67,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init model
|
||||
model = ForwardTTS(config, ap, tokenizer)
|
||||
|
|
|
@ -77,7 +77,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# INITIALIZE THE MODEL
|
||||
# Models take a config object and a speaker manager as input
|
||||
|
|
|
@ -74,7 +74,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# INITIALIZE THE MODEL
|
||||
# Models take a config object and a speaker manager as input
|
||||
|
@ -84,11 +89,6 @@ model = Tacotron2(config, ap, tokenizer, speaker_manager=None)
|
|||
|
||||
# init the trainer and 🚀
|
||||
trainer = Trainer(
|
||||
TrainerArgs(),
|
||||
config,
|
||||
output_path,
|
||||
model=model,
|
||||
train_samples=train_samples,
|
||||
eval_samples=eval_samples
|
||||
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
|
||||
)
|
||||
trainer.fit()
|
||||
|
|
|
@ -40,11 +40,6 @@ model = GAN(config, ap)
|
|||
|
||||
# init the trainer and 🚀
|
||||
trainer = Trainer(
|
||||
TrainerArgs(),
|
||||
config,
|
||||
output_path,
|
||||
model=model,
|
||||
train_samples=train_samples,
|
||||
eval_samples=eval_samples
|
||||
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
|
||||
)
|
||||
trainer.fit()
|
||||
|
|
|
@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init model
|
||||
model = Vits(config, ap, tokenizer, speaker_manager=None)
|
||||
|
|
|
@ -6,12 +6,11 @@ from trainer import Trainer, TrainerArgs
|
|||
from TTS.config.shared_configs import BaseAudioConfig
|
||||
from TTS.tts.configs.shared_configs import BaseDatasetConfig
|
||||
from TTS.tts.configs.vits_config import VitsConfig
|
||||
from TTS.tts.models.vits import CharactersConfig
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.models.vits import Vits, VitsArgs
|
||||
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs
|
||||
from TTS.tts.utils.languages import LanguageManager
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.tts.utils.speakers import SpeakerManager
|
||||
from TTS.tts.utils.text.tokenizer import TTSTokenizer
|
||||
from TTS.utils.audio import AudioProcessor
|
||||
|
||||
output_path = os.path.dirname(os.path.abspath(__file__))
|
||||
|
@ -110,7 +109,12 @@ config.from_dict(config.to_dict())
|
|||
ap = AudioProcessor(**config.audio.to_dict())
|
||||
|
||||
# load training samples
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it maps speaker-id to speaker-name in the model and data-loader
|
||||
|
@ -131,11 +135,6 @@ model = Vits(config, ap, tokenizer, speaker_manager, language_manager)
|
|||
|
||||
# init the trainer and 🚀
|
||||
trainer = Trainer(
|
||||
TrainerArgs(),
|
||||
config,
|
||||
output_path,
|
||||
model=model,
|
||||
train_samples=train_samples,
|
||||
eval_samples=eval_samples
|
||||
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
|
||||
)
|
||||
trainer.fit()
|
||||
|
|
|
@ -71,7 +71,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it maps speaker-id to speaker-name in the model and data-loader
|
||||
|
|
|
@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it maps speaker-id to speaker-name in the model and data-loader
|
||||
|
|
|
@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it maps speaker-id to speaker-name in the model and data-loader
|
||||
|
|
|
@ -69,7 +69,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it maps speaker-id to speaker-name in the model and data-loader
|
||||
|
|
|
@ -72,7 +72,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it mainly handles the speaker-id to speaker-name mapping for the model and the data-loader
|
||||
|
|
|
@ -78,7 +78,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it mainly handles the speaker-id to speaker-name mapping for the model and the data-loader
|
||||
|
|
|
@ -78,7 +78,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it mainly handles the speaker-id to speaker-name mapping for the model and the data-loader
|
||||
|
|
|
@ -79,7 +79,12 @@ tokenizer, config = TTSTokenizer.init_from_config(config)
|
|||
# You can define your custom sample loader returning the list of samples.
|
||||
# Or define your custom formatter and pass it to the `load_tts_samples`.
|
||||
# Check `TTS.tts.datasets.load_tts_samples` for more details.
|
||||
train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
|
||||
train_samples, eval_samples = load_tts_samples(
|
||||
dataset_config,
|
||||
eval_split=True,
|
||||
eval_split_max_size=config.eval_split_max_size,
|
||||
eval_split_size=config.eval_split_size,
|
||||
)
|
||||
|
||||
# init speaker manager for multi-speaker training
|
||||
# it maps speaker-id to speaker-name in the model and data-loader
|
||||
|
|
|
@ -33,6 +33,6 @@ pypinyin
|
|||
mecab-python3==1.0.3
|
||||
unidic-lite==1.0.8
|
||||
# gruut+supported langs
|
||||
gruut[cs,de,es,fr,it,nl,pt,ru,sv]~=2.0.0
|
||||
gruut[cs,de,es,fr,it,nl,pt,ru,sv]==2.2.3
|
||||
# others
|
||||
webrtcvad # for VAD
|
||||
|
|
|
@ -15,7 +15,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
|
|||
def test_GlowTTS():
|
||||
# set paths
|
||||
config_path = os.path.join(get_tests_input_path(), "test_glow_tts.json")
|
||||
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar")
|
||||
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
|
||||
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
|
||||
# load config
|
||||
c = load_config(config_path)
|
||||
|
@ -33,7 +33,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
|
|||
def test_Tacotron2():
|
||||
# set paths
|
||||
config_path = os.path.join(get_tests_input_path(), "test_tacotron2_config.json")
|
||||
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar")
|
||||
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
|
||||
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
|
||||
# load config
|
||||
c = load_config(config_path)
|
||||
|
@ -51,7 +51,7 @@ class TestExtractTTSSpectrograms(unittest.TestCase):
|
|||
def test_Tacotron():
|
||||
# set paths
|
||||
config_path = os.path.join(get_tests_input_path(), "test_tacotron_config.json")
|
||||
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth.tar")
|
||||
checkpoint_path = os.path.join(get_tests_output_path(), "checkpoint_test.pth")
|
||||
output_path = os.path.join(get_tests_output_path(), "output_extract_tts_spectrograms/")
|
||||
# load config
|
||||
c = load_config(config_path)
|
||||
|
|
|
@ -12,7 +12,7 @@ from TTS.tts.utils.speakers import SpeakerManager
|
|||
from TTS.utils.audio import AudioProcessor
|
||||
|
||||
encoder_config_path = os.path.join(get_tests_input_path(), "test_speaker_encoder_config.json")
|
||||
encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth.tar")
|
||||
encoder_model_path = os.path.join(get_tests_input_path(), "checkpoint_0.pth")
|
||||
sample_wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0001.wav")
|
||||
sample_wav_path2 = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs/LJ001-0002.wav")
|
||||
d_vectors_file_path = os.path.join(get_tests_input_path(), "../data/dummy_speakers.json")
|
||||
|
|
|
@ -1,14 +1,13 @@
|
|||
import functools
|
||||
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
|
||||
from TTS.config.shared_configs import BaseDatasetConfig
|
||||
from TTS.encoder.utils.samplers import PerfectBatchSampler
|
||||
from TTS.tts.datasets import load_tts_samples
|
||||
from TTS.tts.utils.languages import get_language_balancer_weights
|
||||
from TTS.tts.utils.speakers import get_speaker_balancer_weights
|
||||
from TTS.encoder.utils.samplers import PerfectBatchSampler
|
||||
|
||||
# Fixing random state to avoid random fails
|
||||
torch.manual_seed(0)
|
||||
|
@ -60,7 +59,9 @@ class TestSamplers(unittest.TestCase):
|
|||
assert not is_balanced(en, pt), "Random sampler is supposed to be unbalanced"
|
||||
|
||||
def test_language_weighted_random_sampler(self): # pylint: disable=no-self-use
|
||||
weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(get_language_balancer_weights(train_samples), len(train_samples))
|
||||
weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(
|
||||
get_language_balancer_weights(train_samples), len(train_samples)
|
||||
)
|
||||
ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)])
|
||||
en, pt = 0, 0
|
||||
for index in ids:
|
||||
|
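For context on how a balancing sampler like the one tested above is typically consumed, here is a toy, self-contained example that plugs a `WeightedRandomSampler` into a `DataLoader`; the dataset and weights are placeholders, not the real TTS samples.

```python
# Toy example: oversample the two high-weight items at the front of the dataset.
from torch.utils.data import DataLoader, WeightedRandomSampler

dataset = list(range(10))               # stand-in dataset
weights = [0.9] * 2 + [0.1] * 8         # higher weight => drawn more often
sampler = WeightedRandomSampler(weights, num_samples=len(dataset))
loader = DataLoader(dataset, batch_size=2, sampler=sampler)

for batch in loader:
    print(batch)                        # items 0 and 1 dominate the batches
```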
@ -73,7 +74,9 @@ class TestSamplers(unittest.TestCase):
|
|||
|
||||
def test_speaker_weighted_random_sampler(self): # pylint: disable=no-self-use
|
||||
|
||||
weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(get_speaker_balancer_weights(train_samples), len(train_samples))
|
||||
weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(
|
||||
get_speaker_balancer_weights(train_samples), len(train_samples)
|
||||
)
|
||||
ids = functools.reduce(lambda a, b: a + b, [list(weighted_sampler) for i in range(100)])
|
||||
spk1, spk2 = 0, 0
|
||||
for index in ids:
|
||||
|
@ -92,11 +95,12 @@ class TestSamplers(unittest.TestCase):
|
|||
sampler = PerfectBatchSampler(
|
||||
train_samples,
|
||||
classes,
|
||||
batch_size=2 * 3, # total batch size
|
||||
batch_size=2 * 3, # total batch size
|
||||
num_classes_in_batch=2,
|
||||
label_key="speaker_name",
|
||||
shuffle=False,
|
||||
drop_last=True)
|
||||
drop_last=True,
|
||||
)
|
||||
batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)])
|
||||
for batch in batchs:
|
||||
spk1, spk2 = 0, 0
|
||||
|
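A rough, toy-data sketch of constructing the sampler configured above; the argument names follow the call in the test, and per that test each yielded batch mixes the two classes evenly. The item layout (dicts with a `speaker_name` key) is an assumption for illustration only.

```python
# Toy sketch; the real tests build train_samples via load_tts_samples().
from TTS.encoder.utils.samplers import PerfectBatchSampler

train_samples = [{"speaker_name": f"spk{i % 2}", "audio_file": f"{i}.wav"} for i in range(8)]
sampler = PerfectBatchSampler(
    train_samples,
    ["spk0", "spk1"],            # the classes to balance
    batch_size=4,                # total batch size, split evenly across classes
    num_classes_in_batch=2,
    label_key="speaker_name",
    shuffle=False,
    drop_last=True,
)
for batch in sampler:
    print(batch)                 # per the test above, one batch of sample indices
```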
@ -116,11 +120,12 @@ class TestSamplers(unittest.TestCase):
|
|||
sampler = PerfectBatchSampler(
|
||||
train_samples,
|
||||
classes,
|
||||
batch_size=2 * 3, # total batch size
|
||||
batch_size=2 * 3, # total batch size
|
||||
num_classes_in_batch=2,
|
||||
label_key="speaker_name",
|
||||
shuffle=True,
|
||||
drop_last=False)
|
||||
drop_last=False,
|
||||
)
|
||||
batchs = functools.reduce(lambda a, b: a + b, [list(sampler) for i in range(100)])
|
||||
for batch in batchs:
|
||||
spk1, spk2 = 0, 0
|
||||
|
|
|
@ -20,7 +20,7 @@ class SynthesizerTest(unittest.TestCase):
|
|||
def test_in_out(self):
|
||||
self._create_random_model()
|
||||
tts_root_path = get_tests_output_path()
|
||||
tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth.tar")
|
||||
tts_checkpoint = os.path.join(tts_root_path, "checkpoint_10.pth")
|
||||
tts_config = os.path.join(tts_root_path, "dummy_model_config.json")
|
||||
synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None)
|
||||
synthesizer.tts("Better this test works!!")
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"tts_checkpoint":"checkpoint_10.pth.tar", // tts checkpoint file
|
||||
"tts_checkpoint":"checkpoint_10.pth", // tts checkpoint file
|
||||
"tts_config":"dummy_model_config.json", // tts config.json file
|
||||
"tts_speakers": null, // json file listing speaker ids. null if no speaker embedding.
|
||||
"wavernn_lib_path": null, // Rootpath to wavernn project folder to be imported. If this is null, model uses GL for speech synthesis.
|
||||
|
|