Mirror of https://github.com/coqui-ai/TTS.git, commit d0292dd2d1
@@ -6,6 +6,7 @@ This repository is governed by [the Contributor Covenant Code of Conduct](https:

 ## Where to start.

 We welcome everyone who likes to contribute to 🐸TTS.

 You can contribute not only with code but with bug reports, comments, questions, answers, or just a simple tweet to spread the word.

 If you like to contribute code, squash a bug but if you don't know where to start, here are some pointers.
@@ -25,6 +26,16 @@ If you like to contribute code, squash a bug but if you don't know where to star

 We list all the target improvements for the next version. You can pick one of them and start contributing.

 - Also feel free to suggest new features, ideas and models. We're always open for new things.

+##### Call for sharing language models
+If possible, please consider sharing your pre-trained models in any language (if the licences allow for you to do so). We will include them in our model catalogue for public use and give the proper attribution, whether it be your name, company, website or any other source specified.
+
+This model can be shared in two ways:
+1. Share the model files with us and we serve them with the next 🐸 TTS release.
+2. Upload your models on GDrive and share the link.
+
+Models are served under the `.models.json` file and any model is available under the TTS CLI or Server end points.
+
+Either way you choose, please make sure you send the models [here](https://github.com/coqui-ai/TTS/issues/380).

 ## Sending a ✨**PR**✨

 If you have a new feature, a model to implement, or a bug to squash, go ahead and send a ✨**PR**✨.
@@ -132,7 +132,7 @@
         "thorsten":{
             "tacotron2-DCA":{
                 "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/tts_models--de--thorsten--tacotron2-DCA.zip",
-                "default_vocoder": "vocoder_models/de/thorsten/wavegrad",
+                "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
                 "author": "@thorstenMueller",
                 "commit": "unknown"
             }
@@ -230,6 +230,11 @@
                 "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip",
                 "author": "@thorstenMueller",
                 "commit": "unknown"
+            },
+            "fullband-melgan":{
+                "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.1.3/vocoder_models--de--thorsten--fullband-melgan.zip",
+                "author": "@thorstenMueller",
+                "commit": "unknown"
             }
         }
     }
 }
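Taken together, the two `.models.json` hunks register the new Fullband-MelGAN release and make it the default vocoder for the German Thorsten Tacotron2-DCA model. A minimal sketch of how a consumer could resolve that mapping; the nested layout (including the `tts_models` top-level key) is assumed from the hunks above, and the real TTS `ModelManager` API is not shown here:

```python
import json

def resolve_default_vocoder(models_json_path: str, lang: str, dataset: str, model: str) -> str:
    """A sketch: look up a model entry and return its default vocoder name."""
    with open(models_json_path, "r", encoding="utf-8") as f:
        catalogue = json.load(f)
    entry = catalogue["tts_models"][lang][dataset][model]
    # e.g. "vocoder_models/de/thorsten/fullband-melgan" after this commit
    return entry["default_vocoder"]
```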
@@ -1,80 +1,47 @@
 import argparse
-import glob
 import os
+from argparse import RawTextHelpFormatter

-import torch
 from tqdm import tqdm

-from TTS.config import BaseDatasetConfig, load_config
-from TTS.speaker_encoder.utils.generic_utils import setup_model
+from TTS.config import load_config
 from TTS.tts.datasets import load_meta_data
 from TTS.tts.utils.speakers import SpeakerManager
-from TTS.utils.audio import AudioProcessor

 parser = argparse.ArgumentParser(
-    description='Compute embedding vectors for each wav file in a dataset. If "target_dataset" is defined, it generates "speakers.json" necessary for training a multi-speaker model.'
+    description="""Compute embedding vectors for each wav file in a dataset.\n\n"""
+    """
+    Example runs:
+    python TTS/bin/compute_embeddings.py speaker_encoder_model.pth.tar speaker_encoder_config.json dataset_config.json embeddings_output_path/
+    """,
+    formatter_class=RawTextHelpFormatter,
 )
-parser.add_argument("model_path", type=str, help="Path to model outputs (checkpoint, tensorboard etc.).")
+parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
 parser.add_argument(
     "config_path",
     type=str,
-    help="Path to config file for training.",
+    help="Path to model config file.",
 )
-parser.add_argument("data_path", type=str, help="Data path for wav files - directory or CSV file")
-parser.add_argument("output_path", type=str, help="path for output speakers.json.")
 parser.add_argument(
-    "--target_dataset",
+    "config_dataset_path",
     type=str,
-    default="",
-    help="Target dataset to pick a processor from TTS.tts.dataset.preprocess. Necessary to create a speakers.json file.",
+    help="Path to dataset config file.",
 )
+parser.add_argument("output_path", type=str, help="path for output speakers.json and/or speakers.npy.")
 parser.add_argument("--use_cuda", type=bool, help="flag to set cuda.", default=True)
-parser.add_argument("--separator", type=str, help="Separator used in file if CSV is passed for data_path", default="|")
+parser.add_argument("--eval", type=bool, help="compute eval.", default=True)

 args = parser.parse_args()

+c_dataset = load_config(args.config_dataset_path)

-c = load_config(args.config_path)
-ap = AudioProcessor(**c["audio"])
+meta_data_train, meta_data_eval = load_meta_data(c_dataset.datasets, eval_split=args.eval)
+wav_files = meta_data_train + meta_data_eval

-data_path = args.data_path
-split_ext = os.path.splitext(data_path)
-sep = args.separator
+speaker_manager = SpeakerManager(
+    encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
+)

-if args.target_dataset != "":
-    # if target dataset is defined
-    dataset_config = [
-        BaseDatasetConfig(name=args.target_dataset, path=args.data_path, meta_file_train=None, meta_file_val=None),
-    ]
-    wav_files, _ = load_meta_data(dataset_config, eval_split=False)
-else:
-    # if target dataset is not defined
-    if len(split_ext) > 0 and split_ext[1].lower() == ".csv":
-        # Parse CSV
-        print(f"CSV file: {data_path}")
-        with open(data_path) as f:
-            wav_path = os.path.join(os.path.dirname(data_path), "wavs")
-            wav_files = []
-            print(f"Separator is: {sep}")
-            for line in f:
-                components = line.split(sep)
-                if len(components) != 2:
-                    print("Invalid line")
-                    continue
-                wav_file = os.path.join(wav_path, components[0] + ".wav")
-                # print(f'wav_file: {wav_file}')
-                if os.path.exists(wav_file):
-                    wav_files.append(wav_file)
-            print(f"Count of wavs imported: {len(wav_files)}")
-    else:
-        # Parse all wav files in data_path
-        wav_files = glob.glob(data_path + "/**/*.wav", recursive=True)

-# define Encoder model
-model = setup_model(c)
-model.load_state_dict(torch.load(args.model_path)["model"])
-model.eval()
-if args.use_cuda:
-    model.cuda()

 # compute speaker embeddings
 speaker_mapping = {}
@@ -85,18 +52,14 @@ for idx, wav_file in enumerate(tqdm(wav_files)):
     else:
         speaker_name = None

-    mel_spec = ap.melspectrogram(ap.load_wav(wav_file, sr=ap.sample_rate)).T
-    mel_spec = torch.FloatTensor(mel_spec[None, :, :])
-    if args.use_cuda:
-        mel_spec = mel_spec.cuda()
-    embedd = model.compute_embedding(mel_spec)
-    embedd = embedd.detach().cpu().numpy()
+    # extract the embedding
+    embedd = speaker_manager.compute_d_vector_from_clip(wav_file)

     # create speaker_mapping if target dataset is defined
     wav_file_name = os.path.basename(wav_file)
     speaker_mapping[wav_file_name] = {}
     speaker_mapping[wav_file_name]["name"] = speaker_name
-    speaker_mapping[wav_file_name]["embedding"] = embedd.flatten().tolist()
+    speaker_mapping[wav_file_name]["embedding"] = embedd

 if speaker_mapping:
     # save speaker_mapping if target dataset is defined
@@ -104,8 +67,9 @@ if speaker_mapping:
         mapping_file_path = os.path.join(args.output_path, "speakers.json")
     else:
         mapping_file_path = args.output_path

     os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
-    speaker_manager = SpeakerManager()
     # pylint: disable=W0212
     speaker_manager._save_json(mapping_file_path, speaker_mapping)
     print("Speaker embeddings saved at:", mapping_file_path)
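The rewrite above replaces the script's hand-rolled CSV/glob file discovery and manual encoder loading with a dataset config plus `SpeakerManager`, which now owns the encoder and the per-clip d-vector computation. A condensed sketch of the resulting flow; the argument names and `SpeakerManager` keywords are taken from the hunks, while the paths are placeholders:

```python
# A minimal sketch of the new compute_embeddings.py flow (paths are illustrative).
from TTS.config import load_config
from TTS.tts.datasets import load_meta_data
from TTS.tts.utils.speakers import SpeakerManager

c_dataset = load_config("dataset_config.json")
train_items, eval_items = load_meta_data(c_dataset.datasets, eval_split=True)

manager = SpeakerManager(
    encoder_model_path="speaker_encoder_model.pth.tar",
    encoder_config_path="speaker_encoder_config.json",
    use_cuda=False,
)
# each item is [text, wav_path, speaker_name]; one d-vector per audio clip
d_vector = manager.compute_d_vector_from_clip(train_items[0][1])
```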
@@ -227,7 +227,7 @@ def main(args):  # pylint: disable=redefined-outer-name
     ap = AudioProcessor(**c.audio)

     # load data instances
-    meta_data_train, meta_data_eval = load_meta_data(c.datasets)
+    meta_data_train, meta_data_eval = load_meta_data(c.datasets, eval_split=args.eval)

     # use eval and training partitions
     meta_data = meta_data_train + meta_data_eval
@@ -271,6 +271,7 @@ if __name__ == "__main__":
     parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
     parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
     parser.add_argument("--quantized", action="store_true", help="Save quantized audio files")
+    parser.add_argument("--eval", type=bool, help="compute eval.", default=True)
     args = parser.parse_args()

     c = load_config(args.config_path)
@@ -1,40 +1,41 @@
 """Find all the unique characters in a dataset"""
 import argparse
-import os
 from argparse import RawTextHelpFormatter

-from TTS.tts.datasets.formatters import get_preprocessor_by_name
+from TTS.config import load_config
+from TTS.tts.datasets import load_meta_data


 def main():
     # pylint: disable=bad-option-value
     parser = argparse.ArgumentParser(
         description="""Find all the unique characters or phonemes in a dataset.\n\n"""
-        """Target dataset must be defined in TTS.tts.datasets.formatters\n\n"""
         """
         Example runs:

-        python TTS/bin/find_unique_chars.py --dataset ljspeech --meta_file /path/to/LJSpeech/metadata.csv
+        python TTS/bin/find_unique_chars.py --config_path config.json
         """,
         formatter_class=RawTextHelpFormatter,
     )
+    parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
-    parser.add_argument(
-        "--dataset", type=str, default="", help="One of the target dataset names in TTS.tts.datasets.formatters."
-    )
-    parser.add_argument("--meta_file", type=str, default=None, help="Path to the transcriptions file of the dataset.")

     args = parser.parse_args()

-    preprocessor = get_preprocessor_by_name(args.dataset)
-    items = preprocessor(os.path.dirname(args.meta_file), os.path.basename(args.meta_file))
+    c = load_config(args.config_path)

+    # load all datasets
+    train_items, eval_items = load_meta_data(c.datasets, eval_split=True)
+    items = train_items + eval_items

     texts = "".join(item[0] for item in items)
     chars = set(texts)
     lower_chars = filter(lambda c: c.islower(), chars)
+    chars_force_lower = [c.lower() for c in chars]
+    chars_force_lower = set(chars_force_lower)

     print(f" > Number of unique characters: {len(chars)}")
     print(f" > Unique characters: {''.join(sorted(chars))}")
     print(f" > Unique lower characters: {''.join(sorted(lower_chars))}")
+    print(f" > Unique all forced to lower characters: {''.join(sorted(chars_force_lower))}")


 if __name__ == "__main__":
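For intuition, here is the character-set logic from the script run on a toy item list; this is plain Python, independent of the TTS codebase, and shows why the new forced-lower set is useful when upper- and lower-case variants should count as one symbol:

```python
# items mimic the loader output: [text, wav_path, speaker_name]
items = [["Bonjour!", "wavs/1.wav", "spk0"], ["Ça va bien?", "wavs/2.wav", "spk0"]]

texts = "".join(item[0] for item in items)
chars = set(texts)                              # 'B' and 'b', 'Ç' and 'ç' stay distinct
lower_chars = filter(lambda c: c.islower(), chars)
chars_force_lower = {c.lower() for c in chars}  # merges case variants into one entry

print(sorted(chars_force_lower))
```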
@@ -164,7 +164,7 @@ def main(args):  # pylint: disable=redefined-outer-name
     elif c.loss == "angleproto":
         criterion = AngleProtoLoss()
     elif c.loss == "softmaxproto":
-        criterion = SoftmaxAngleProtoLoss(c.model["proj_dim"], num_speakers)
+        criterion = SoftmaxAngleProtoLoss(c.model_params["proj_dim"], num_speakers)
     else:
         raise Exception("The %s not is a loss supported" % c.loss)
@@ -103,7 +103,8 @@ synthesizer = Synthesizer(
     model_path, config_path, speakers_file_path, vocoder_path, vocoder_config_path, use_cuda=args.use_cuda
 )

-use_multi_speaker = synthesizer.speaker_manager is not None
+use_multi_speaker = synthesizer.tts_model.speaker_manager is not None and synthesizer.tts_model.num_speakers > 1
+speaker_manager = synthesizer.tts_model.speaker_manager if hasattr(synthesizer.tts_model, "speaker_manager") else None
 # TODO: set this from SpeakerManager
 use_gst = synthesizer.tts_config.get("use_gst", False)
 app = Flask(__name__)
@@ -134,7 +135,7 @@ def index():
         "index.html",
         show_details=args.show_details,
         use_multi_speaker=use_multi_speaker,
-        speaker_ids=synthesizer.speaker_manager.speaker_ids if synthesizer.speaker_manager else None,
+        speaker_ids=speaker_manager.speaker_ids if speaker_manager is not None else None,
         use_gst=use_gst,
     )
@@ -1,3 +1,4 @@
+import numpy as np
 import torch
 from torch import nn
@@ -70,24 +71,32 @@ class LSTMSpeakerEncoder(nn.Module):
         d = torch.nn.functional.normalize(d, p=2, dim=1)
         return d

-    def compute_embedding(self, x, num_frames=160, overlap=0.5):
+    def compute_embedding(self, x, num_frames=250, num_eval=10, return_mean=True):
         """
         Generate embeddings for a batch of utterances
         x: 1xTxD
         """
-        num_overlap = int(num_frames * overlap)
         max_len = x.shape[1]
-        embed = None
-        cur_iter = 0
-        for offset in range(0, max_len, num_frames - num_overlap):
-            cur_iter += 1
-            end_offset = min(x.shape[1], offset + num_frames)
+
+        if max_len < num_frames:
+            num_frames = max_len
+
+        offsets = np.linspace(0, max_len - num_frames, num=num_eval)
+
+        frames_batch = []
+        for offset in offsets:
+            offset = int(offset)
+            end_offset = int(offset + num_frames)
             frames = x[:, offset:end_offset]
-            if embed is None:
-                embed = self.inference(frames)
-            else:
-                embed += self.inference(frames)
-        return embed / cur_iter
+            frames_batch.append(frames)
+
+        frames_batch = torch.cat(frames_batch, dim=0)
+        embeddings = self.inference(frames_batch)
+
+        if return_mean:
+            embeddings = torch.mean(embeddings, dim=0, keepdim=True)
+
+        return embeddings

     def batch_compute_embedding(self, x, seq_lens, num_frames=160, overlap=0.5):
         """
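The new `compute_embedding` replaces overlapping strided windows (which accumulated a running sum one forward pass at a time) with `num_eval` evenly spaced windows via `np.linspace`, scored in a single batched forward pass and mean-pooled. A self-contained sketch of that windowing with a stand-in encoder; everything except the windowing scheme itself is illustrative:

```python
import numpy as np
import torch

def windowed_embedding(x: torch.Tensor, encoder, num_frames: int = 250, num_eval: int = 10) -> torch.Tensor:
    """x: [1, T, D] feature sequence; encoder maps [N, num_frames, D] -> [N, E]."""
    max_len = x.shape[1]
    num_frames = min(num_frames, max_len)
    # evenly spaced window starts; the first begins at 0, the last ends exactly at max_len
    offsets = np.linspace(0, max_len - num_frames, num=num_eval).astype(int)
    frames_batch = torch.cat([x[:, o : o + num_frames] for o in offsets], dim=0)
    embeddings = encoder(frames_batch)           # one forward pass for all windows
    return embeddings.mean(dim=0, keepdim=True)  # [1, E]

# usage with a dummy "encoder" that just mean-pools features over time
dummy_encoder = lambda batch: batch.mean(dim=1)
emb = windowed_embedding(torch.rand(1, 1000, 80), dummy_encoder)
print(emb.shape)  # torch.Size([1, 80])
```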
@@ -110,9 +119,11 @@ class LSTMSpeakerEncoder(nn.Module):
         return embed / num_iters

     # pylint: disable=unused-argument, redefined-builtin
-    def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False):
+    def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False):
         state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
         self.load_state_dict(state["model"])
+        if use_cuda:
+            self.cuda()
         if eval:
             self.eval()
             assert not self.training
@@ -199,3 +199,12 @@ class ResNetSpeakerEncoder(nn.Module):
             embeddings = torch.mean(embeddings, dim=0, keepdim=True)

         return embeddings
+
+    def load_checkpoint(self, config: dict, checkpoint_path: str, eval: bool = False, use_cuda: bool = False):
+        state = torch.load(checkpoint_path, map_location=torch.device("cpu"))
+        self.load_state_dict(state["model"])
+        if use_cuda:
+            self.cuda()
+        if eval:
+            self.eval()
+            assert not self.training
@@ -764,11 +764,11 @@ class Trainer:
         """Run test and log the results. Test run must be defined by the model.
         Model must return figures and audios to be logged by the Tensorboard."""
         if hasattr(self.model, "test_run"):
-            if hasattr(self.eval_loader.load_test_samples):
-                samples = self.eval_loader.load_test_samples(1)
-                figures, audios = self.model.test_run(samples)
+            if hasattr(self.eval_loader.dataset, "load_test_samples"):
+                samples = self.eval_loader.dataset.load_test_samples(1)
+                figures, audios = self.model.test_run(self.ap, samples, None)
             else:
-                figures, audios = self.model.test_run()
+                figures, audios = self.model.test_run(self.ap)
             self.tb_logger.tb_test_audios(self.total_steps_done, audios, self.config.audio["sample_rate"])
             self.tb_logger.tb_test_figures(self.total_steps_done, figures)
@@ -790,7 +790,7 @@ class Trainer:
             self.train_epoch()
             if self.config.run_eval:
                 self.eval_epoch()
-            if epoch >= self.config.test_delay_epochs and self.args.rank < 0:
+            if epoch >= self.config.test_delay_epochs and self.args.rank <= 0:
                 self.test_run()
             self.c_logger.print_epoch_end(
                 epoch, self.keep_avg_eval.avg_values if self.config.run_eval else self.keep_avg_train.avg_values
@@ -202,16 +202,20 @@ def libri_tts(root_path, meta_files=None):
     items = []
     if meta_files is None:
         meta_files = glob(f"{root_path}/**/*trans.tsv", recursive=True)
+    else:
+        if isinstance(meta_files, str):
+            meta_files = [os.path.join(root_path, meta_files)]

     for meta_file in meta_files:
         _meta_file = os.path.basename(meta_file).split(".")[0]
-        speaker_name = _meta_file.split("_")[0]
-        chapter_id = _meta_file.split("_")[1]
-        _root_path = os.path.join(root_path, f"{speaker_name}/{chapter_id}")
         with open(meta_file, "r") as ttf:
             for line in ttf:
                 cols = line.split("\t")
-                wav_file = os.path.join(_root_path, cols[0] + ".wav")
-                text = cols[1]
+                file_name = cols[0]
+                speaker_name, chapter_id, *_ = cols[0].split("_")
+                _root_path = os.path.join(root_path, f"{speaker_name}/{chapter_id}")
+                wav_file = os.path.join(_root_path, file_name + ".wav")
+                text = cols[2]
                 items.append([text, wav_file, "LTTS_" + speaker_name])
     for item in items:
         assert os.path.exists(item[1]), f" [!] wav files don't exist - {item[1]}"
@@ -288,6 +292,19 @@ def vctk_slim(root_path, meta_files=None, wavs_path="wav48"):
     return items


+def mls(root_path, meta_files=None):
+    """http://www.openslr.org/94/"""
+    items = []
+    with open(os.path.join(root_path, meta_files), "r") as meta:
+        for line in meta:
+            file, text = line.split("\t")
+            text = text[:-1]
+            speaker, book, *_ = file.split("_")
+            wav_file = os.path.join(root_path, os.path.dirname(meta_files), "audio", speaker, book, file + ".wav")
+            items.append([text, wav_file, "MLS_" + speaker])
+    return items
+
+
 # ======================================== VOX CELEB ===========================================
 def voxceleb2(root_path, meta_file=None):
     """
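The new `mls` formatter assumes the Multilingual LibriSpeech layout, where each line of the transcript file is `<speaker>_<book>_<utterance>\t<text>` and audio lives under `audio/<speaker>/<book>/`. A toy line parsed the same way; the paths and IDs here are placeholders:

```python
import os

line = "1234_5678_000042\tnach der mitte des achtzehnten jahrhunderts\n"
file, text = line.split("\t")
text = text[:-1]                     # strip the trailing newline
speaker, book, *_ = file.split("_")  # underscore-delimited IDs
wav_file = os.path.join("mls_german", "train", "audio", speaker, book, file + ".wav")
print(speaker, book, wav_file)
# 1234 5678 mls_german/train/audio/1234/5678/1234_5678_000042.wav
```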
@@ -246,9 +246,9 @@ class Huber(nn.Module):
 class TacotronLoss(torch.nn.Module):
     """Collection of Tacotron set-up based on provided config."""

-    def __init__(self, c, stopnet_pos_weight=10, ga_sigma=0.4):
+    def __init__(self, c, ga_sigma=0.4):
         super().__init__()
-        self.stopnet_pos_weight = stopnet_pos_weight
+        self.stopnet_pos_weight = c.stopnet_pos_weight
         self.ga_alpha = c.ga_alpha
         self.decoder_diff_spec_alpha = c.decoder_diff_spec_alpha
         self.postnet_diff_spec_alpha = c.postnet_diff_spec_alpha
@@ -274,7 +274,7 @@ class TacotronLoss(torch.nn.Module):
         self.criterion_ssim = SSIMLoss()
         # stopnet loss
         # pylint: disable=not-callable
-        self.criterion_st = BCELossMasked(pos_weight=torch.tensor(stopnet_pos_weight)) if c.stopnet else None
+        self.criterion_st = BCELossMasked(pos_weight=torch.tensor(self.stopnet_pos_weight)) if c.stopnet else None

     def forward(
         self,
@@ -284,6 +284,7 @@ class TacotronLoss(torch.nn.Module):
         linear_input,
         stopnet_output,
         stopnet_target,
+        stop_target_length,
         output_lens,
         decoder_b_output,
         alignments,
@@ -315,11 +316,11 @@ class TacotronLoss(torch.nn.Module):
         return_dict["decoder_loss"] = decoder_loss
         return_dict["postnet_loss"] = postnet_loss

-        # stopnet loss
         stop_loss = (
-            self.criterion_st(stopnet_output, stopnet_target, output_lens) if self.config.stopnet else torch.zeros(1)
+            self.criterion_st(stopnet_output, stopnet_target, stop_target_length)
+            if self.config.stopnet
+            else torch.zeros(1)
         )
-        if not self.config.separate_stopnet and self.config.stopnet:
         loss += stop_loss
         return_dict["stopnet_loss"] = stop_loss
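The stop-token head is a binary classifier over decoder steps, and the positive class ("stop now") is rare relative to the "keep going" steps, hence the `pos_weight` now read from the config. A self-contained sketch of the idea using plain `binary_cross_entropy_with_logits` plus a length mask; `BCELossMasked` itself is not reproduced here, and the weight value follows the `"stopnet_pos_weight": 15.0` config seen later in this commit:

```python
import torch

def masked_stop_loss(logits, targets, lengths, pos_weight=15.0):
    """logits/targets: [B, T]; lengths: [B] valid steps per sample. A sketch, not TTS's BCELossMasked."""
    bce = torch.nn.functional.binary_cross_entropy_with_logits(
        logits, targets, pos_weight=torch.tensor(pos_weight), reduction="none"
    )
    # zero out padded decoder steps before averaging
    mask = torch.arange(targets.shape[1])[None, :] < lengths[:, None]
    return (bce * mask).sum() / mask.sum()

logits = torch.randn(2, 6)
targets = torch.tensor([[0, 0, 0, 1, 1, 1], [0, 0, 1, 1, 1, 1]], dtype=torch.float)
lengths = torch.tensor([4, 3])
print(masked_stop_loss(logits, targets, lengths))
```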
@@ -70,7 +70,7 @@ class BaseTTS(BaseModel):

     def get_aux_input(self, **kwargs) -> Dict:
         """Prepare and return `aux_input` used by `forward()`"""
-        pass
+        return {"speaker_id": None, "style_wav": None, "d_vector": None}

     def format_batch(self, batch: Dict) -> Dict:
         """Generic batch formatting for `TTSDataset`.
@@ -119,9 +119,10 @@ class BaseTTS(BaseModel):
             ), f" [!] total duration {dur.sum()} vs spectrogram length {mel_lengths[idx]}"
             durations[idx, : text_lengths[idx]] = dur

-        # set stop targets view, we predict a single stop token per iteration.
+        # set stop targets wrt reduction factor
         stop_targets = stop_targets.view(text_input.shape[0], stop_targets.size(1) // self.config.r, -1)
         stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2)
+        stop_target_lengths = torch.divide(mel_lengths, self.config.r).ceil_()

         return {
             "text_input": text_input,
@@ -131,6 +132,7 @@ class BaseTTS(BaseModel):
             "mel_lengths": mel_lengths,
             "linear_input": linear_input,
             "stop_targets": stop_targets,
+            "stop_target_lengths": stop_target_lengths,
             "attn_mask": attn_mask,
             "durations": durations,
             "speaker_ids": speaker_ids,
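With a reduction factor `r`, the decoder emits `r` mel frames per step, so there are `ceil(mel_length / r)` stop decisions per utterance; that is exactly the `stop_target_lengths` computed above and threaded through the batch. A quick check of the arithmetic:

```python
import torch

mel_lengths = torch.tensor([100, 101, 87])
r = 2  # reduction factor: mel frames emitted per decoder step
stop_target_lengths = torch.divide(mel_lengths, r).ceil_()
print(stop_target_lengths)  # tensor([50., 51., 44.])
```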
@@ -200,7 +202,7 @@ class BaseTTS(BaseModel):
         )
         return loader

-    def test_run(self) -> Tuple[Dict, Dict]:
+    def test_run(self, ap) -> Tuple[Dict, Dict]:
         """Generic test run for `tts` models used by `Trainer`.

         You can override this for a different behaviour.
@@ -212,14 +214,14 @@ class BaseTTS(BaseModel):
         test_audios = {}
         test_figures = {}
         test_sentences = self.config.test_sentences
-        aux_inputs = self._get_aux_inputs()
+        aux_inputs = self.get_aux_input()
         for idx, sen in enumerate(test_sentences):
             wav, alignment, model_outputs, _ = synthesis(
-                self.model,
+                self,
                 sen,
                 self.config,
-                self.use_cuda,
-                self.ap,
+                "cuda" in str(next(self.parameters()).device),
+                ap,
                 speaker_id=aux_inputs["speaker_id"],
                 d_vector=aux_inputs["d_vector"],
                 style_wav=aux_inputs["style_wav"],
@@ -229,6 +231,6 @@ class BaseTTS(BaseModel):
             ).values()

             test_audios["{}-audio".format(idx)] = wav
-            test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, self.ap, output_fig=False)
+            test_figures["{}-prediction".format(idx)] = plot_spectrogram(model_outputs, ap, output_fig=False)
             test_figures["{}-alignment".format(idx)] = plot_alignment(alignment, output_fig=False)
         return test_figures, test_audios
@@ -219,6 +219,7 @@ class Tacotron(BaseTacotron):
         mel_lengths = batch["mel_lengths"]
         linear_input = batch["linear_input"]
         stop_targets = batch["stop_targets"]
+        stop_target_lengths = batch["stop_target_lengths"]
         speaker_ids = batch["speaker_ids"]
         d_vectors = batch["d_vectors"]
@@ -250,6 +251,7 @@ class Tacotron(BaseTacotron):
             linear_input,
             outputs["stop_tokens"],
             stop_targets,
+            stop_target_lengths,
             mel_lengths,
             outputs["decoder_outputs_backward"],
             outputs["alignments"],
@@ -224,6 +224,7 @@ class Tacotron2(BaseTacotron):
         mel_lengths = batch["mel_lengths"]
         linear_input = batch["linear_input"]
         stop_targets = batch["stop_targets"]
+        stop_target_lengths = batch["stop_target_lengths"]
         speaker_ids = batch["speaker_ids"]
         d_vectors = batch["d_vectors"]
@@ -255,6 +256,7 @@ class Tacotron2(BaseTacotron):
             linear_input,
             outputs["stop_tokens"],
             stop_targets,
+            stop_target_lengths,
             mel_lengths,
             outputs["decoder_outputs_backward"],
             outputs["alignments"],
@@ -27,10 +27,19 @@ def prepare_tensor(inputs, out_steps):
     return np.stack([_pad_tensor(x, pad_len) for x in inputs])


-def _pad_stop_target(x, length):
-    _pad = 0.0
+def _pad_stop_target(x: np.ndarray, length: int, pad_val=1) -> np.ndarray:
+    """Pad stop target array.
+
+    Args:
+        x (np.ndarray): Stop target array.
+        length (int): Length after padding.
+        pad_val (int, optional): Padding value. Defaults to 1.
+
+    Returns:
+        np.ndarray: Padded stop target array.
+    """
     assert x.ndim == 1
-    return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=_pad)
+    return np.pad(x, (0, length - x.shape[0]), mode="constant", constant_values=pad_val)


 def prepare_stop_target(inputs, out_steps):
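The behavioural change here: padded decoder steps are now filled with `1` ("already stopped") instead of `0` ("keep going"), which is exactly what the updated dataset test later in this commit asserts. A quick demonstration in plain numpy:

```python
import numpy as np

stop_target = np.array([0.0, 0.0, 0.0, 1.0])  # utterance stops at the 4th step

# old behaviour: padding with 0 told the model to keep generating past the end
print(np.pad(stop_target, (0, 3), mode="constant", constant_values=0.0))
# [0. 0. 0. 1. 0. 0. 0.]

# new behaviour: padding with 1 keeps the padded region in the "stopped" state
print(np.pad(stop_target, (0, 3), mode="constant", constant_values=1))
# [0. 0. 0. 1. 1. 1. 1.]
```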
@@ -59,6 +59,7 @@ class SpeakerManager:
         speaker_id_file_path: str = "",
         encoder_model_path: str = "",
         encoder_config_path: str = "",
+        use_cuda: bool = False,
     ):

         self.data_items = []
@@ -67,6 +68,7 @@ class SpeakerManager:
         self.clip_ids = []
         self.speaker_encoder = None
         self.speaker_encoder_ap = None
+        self.use_cuda = use_cuda

         if data_items:
             self.speaker_ids, self.speaker_names, _ = self.parse_speakers_from_data(self.data_items)
@@ -222,11 +224,11 @@ class SpeakerManager:
         """
         self.speaker_encoder_config = load_config(config_path)
         self.speaker_encoder = setup_model(self.speaker_encoder_config)
-        self.speaker_encoder.load_checkpoint(config_path, model_path, True)
+        self.speaker_encoder.load_checkpoint(config_path, model_path, eval=True, use_cuda=self.use_cuda)
         self.speaker_encoder_ap = AudioProcessor(**self.speaker_encoder_config.audio)
         # normalize the input audio level and trim silences
-        self.speaker_encoder_ap.do_sound_norm = True
-        self.speaker_encoder_ap.do_trim_silence = True
+        # self.speaker_encoder_ap.do_sound_norm = True
+        # self.speaker_encoder_ap.do_trim_silence = True

     def compute_d_vector_from_clip(self, wav_file: Union[str, list]) -> list:
         """Compute a d_vector from a given audio file.
@@ -242,6 +244,8 @@ class SpeakerManager:
             waveform = self.speaker_encoder_ap.load_wav(wav_file, sr=self.speaker_encoder_ap.sample_rate)
             spec = self.speaker_encoder_ap.melspectrogram(waveform)
             spec = torch.from_numpy(spec.T)
+            if self.use_cuda:
+                spec = spec.cuda()
             spec = spec.unsqueeze(0)
             d_vector = self.speaker_encoder.compute_embedding(spec)
             return d_vector
@@ -272,6 +276,8 @@ class SpeakerManager:
         feats = torch.from_numpy(feats)
         if feats.ndim == 2:
             feats = feats.unsqueeze(0)
+        if self.use_cuda:
+            feats = feats.cuda()
         return self.speaker_encoder.compute_embedding(feats)

     def run_umap(self):
@@ -2,6 +2,7 @@ import glob
 import os
 import random
 from multiprocessing import Manager
+from typing import List, Tuple

 import numpy as np
 import torch
@@ -67,7 +68,19 @@ class WaveGradDataset(Dataset):
             item = self.load_item(idx)
         return item

-    def load_test_samples(self, num_samples):
+    def load_test_samples(self, num_samples: int) -> List[Tuple]:
+        """Return test samples.
+
+        Args:
+            num_samples (int): Number of samples to return.
+
+        Returns:
+            List[Tuple]: melspectrogram and audio.
+
+        Shapes:
+            - melspectrogram (Tensor): :math:`[C, T]`
+            - audio (Tensor): :math:`[T_audio]`
+        """
         samples = []
         return_segments = self.return_segments
         self.return_segments = False
@@ -124,11 +124,16 @@ class Wavegrad(BaseModel):

     @torch.no_grad()
     def inference(self, x, y_n=None):
-        """x: B x D X T"""
+        """
+        Shapes:
+            x: :math:`[B, C , T]`
+            y_n: :math:`[B, 1, T]`
+        """
         if y_n is None:
-            y_n = torch.randn(x.shape[0], 1, self.hop_len * x.shape[-1], dtype=torch.float32).to(x)
+            y_n = torch.randn(x.shape[0], 1, self.hop_len * x.shape[-1])
         else:
-            y_n = torch.FloatTensor(y_n).unsqueeze(0).unsqueeze(0).to(x)
+            y_n = torch.FloatTensor(y_n).unsqueeze(0).unsqueeze(0)
+        y_n = y_n.type_as(x)
         sqrt_alpha_hat = self.noise_level.to(x)
         for n in range(len(self.alpha) - 1, -1, -1):
             y_n = self.c1[n] * (y_n - self.c2[n] * self.forward(y_n, x, sqrt_alpha_hat[n].repeat(x.shape[0])))
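`Tensor.type_as(x)` casts a tensor to `x`'s dtype and device in one call, so both branches above can build `y_n` on CPU and convert once afterwards instead of repeating `.to(x)` in each branch. A small illustration, with the device chosen defensively so the snippet also runs without a GPU:

```python
import torch

x = torch.rand(2, 80, 10, device="cuda" if torch.cuda.is_available() else "cpu")
y_n = torch.randn(2, 1, 2560)  # built on CPU with the default dtype
y_n = y_n.type_as(x)           # now matches x's dtype *and* device
print(y_n.dtype, y_n.device)
```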
@@ -267,8 +272,10 @@ class Wavegrad(BaseModel):
         betas = np.linspace(noise_schedule["min_val"], noise_schedule["max_val"], noise_schedule["num_steps"])
         self.compute_noise_level(betas)
         for sample in samples:
-            x = sample["input"]
-            y = sample["waveform"]
+            x = sample[0]
+            x = x[None, :, :].to(next(self.parameters()).device)
+            y = sample[1]
+            y = y[None, :]
             # compute voice
             y_pred = self.inference(x)
             # compute spectrograms
@@ -322,7 +322,7 @@ class Wavernn(BaseVocoder):

         with torch.no_grad():
             if isinstance(mels, np.ndarray):
-                mels = torch.FloatTensor(mels).type_as(mels)
+                mels = torch.FloatTensor(mels).to(str(next(self.parameters()).device))

             if mels.ndim == 2:
                 mels = mels.unsqueeze(0)
@@ -576,7 +576,8 @@ class Wavernn(BaseVocoder):
         figures = {}
         audios = {}
         for idx, sample in enumerate(samples):
-            x = sample["input"]
+            x = torch.FloatTensor(sample[0])
+            x = x.to(next(self.parameters()).device)
             y_hat = self.inference(x, self.config.batched, self.config.target_samples, self.config.overlap_samples)
             x_hat = ap.melspectrogram(y_hat)
             figures.update(
|
@ -585,7 +586,7 @@ class Wavernn(BaseVocoder):
|
||||||
f"test_{idx}/prediction": plot_spectrogram(x_hat.T),
|
f"test_{idx}/prediction": plot_spectrogram(x_hat.T),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
audios.update({f"test_{idx}/audio", y_hat})
|
audios.update({f"test_{idx}/audio": y_hat})
|
||||||
return figures, audios
|
return figures, audios
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
|
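The one-character fix above is worth calling out: `{key, value}` is a set literal, not a dict, and since the audio value is an array (unhashable), the old line died with a `TypeError` while building the set before `update` even ran. Minimal reproduction in plain Python:

```python
audios = {}
try:
    audios.update({"test_0/audio", [0.1, 0.2]})  # set literal; lists aren't hashable
except TypeError as err:
    print("broken:", err)  # unhashable type: 'list'

audios.update({"test_0/audio": [0.1, 0.2]})      # dict literal: stores the pair
print(list(audios))                               # ['test_0/audio']
```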
@@ -11,6 +11,6 @@ Some of the known public datasets that we successfully applied 🐸TTS:

 - [Spanish](https://drive.google.com/file/d/1Sm_zyBo67XHkiFhcRSQ4YaHPYM0slO_e/view?usp=sharing) - thx! @carlfm01
 - [German - Thorsten OGVD](https://github.com/thorstenMueller/deep-learning-german-tts)
 - [Japanese - Kokoro](https://www.kaggle.com/kaiida/kokoro-speech-dataset-v11-small/version/1)
-- [Chinese](https://www.data-baker.com/open_source.html)
+- [Chinese](https://www.data-baker.com/data/index/source/)

 Let us know if you use 🐸TTS on a different dataset.
@@ -1,5 +1,5 @@
 dependencies = [
-    'torch', 'gdown', 'pysbd', 'gruut', 'anyascii', 'pypinyin', 'coqpit', 'mecab-python3', 'unidic-lite`
+    'torch', 'gdown', 'pysbd', 'gruut', 'anyascii', 'pypinyin', 'coqpit', 'mecab-python3', 'unidic-lite'
 ]
 import torch
File diff suppressed because one or more lines are too long

@@ -50,7 +50,7 @@
     "stopnet_pos_weight": 15.0,
     "run_eval": true,
     "test_delay_epochs": 10,
-    "max_decoder_steps": 50,
+    "max_decoder_steps": 1000,
     "noam_schedule": true,
     "grad_clip": 0.05,
     "epochs": 1000,
@@ -56,7 +56,7 @@
     "run_eval": true,
     "test_delay_epochs": 10,
     "test_sentences_file": null,
-    "max_decoder_steps": 50,
+    "max_decoder_steps": 1000,
     "noam_schedule": true,
     "grad_clip": 0.05,
     "epochs": 1000,
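`max_decoder_steps` caps the autoregressive decoder loop at inference time, so a value of 50 would cut synthesis off after a fraction of a second for most sentences; 1000 is a sensible ceiling for real use. The guard is essentially the following pattern, shown here as a sketch rather than the actual Tacotron decoder:

```python
import itertools

def decode(step_fn, max_decoder_steps: int = 1000):
    """Run step_fn until it signals stop, or until the step cap is hit."""
    outputs, stop = [], False
    while not stop:
        frame, stop = step_fn()
        outputs.append(frame)
        if len(outputs) >= max_decoder_steps:
            print(" > Decoder stopped with max_decoder_steps")  # safety break
            break
    return outputs

counter = itertools.count()
frames = decode(lambda: (next(counter), False))  # step_fn never signals stop
print(len(frames))  # 1000
```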
@@ -207,7 +207,7 @@ class TestTTSDataset(unittest.TestCase):
             assert linear_input[1 - idx, -1].sum() == 0
             assert mel_input[1 - idx, -1].sum() == 0
             assert stop_target[1, mel_lengths[1] - 1] == 1
-            assert stop_target[1, mel_lengths[1] :].sum() == 0
+            assert stop_target[1, mel_lengths[1] :].sum() == stop_target.shape[1] - mel_lengths[1]
             assert len(mel_lengths.shape) == 1

             # check batch zero-frame conditions (zero-frame disabled)
@@ -35,7 +35,7 @@ class LSTMSpeakerEncoderTests(unittest.TestCase):
         assert abs(assert_diff) < 1e-4, f" [!] output_norm has wrong values - {assert_diff}"
         # compute d for a given batch
         dummy_input = T.rand(1, 240, 80)  # B x T x D
-        output = model.compute_embedding(dummy_input, num_frames=160, overlap=0.5)
+        output = model.compute_embedding(dummy_input, num_frames=160, num_eval=5)
         assert output.shape[0] == 1
         assert output.shape[1] == 256
         assert len(output.shape) == 2
@@ -6,7 +6,20 @@ from tests import get_device_id, get_tests_output_path, run_cli
 from TTS.config.shared_configs import BaseAudioConfig
 from TTS.speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig

-config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
+
+def run_test_train():
+    command = (
+        f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} "
+        f"--coqpit.output_path {output_path} "
+        "--coqpit.datasets.0.name ljspeech "
+        "--coqpit.datasets.0.meta_file_train metadata.csv "
+        "--coqpit.datasets.0.meta_file_val metadata.csv "
+        "--coqpit.datasets.0.path tests/data/ljspeech "
+    )
+    run_cli(command)
+
+
+config_path = os.path.join(get_tests_output_path(), "test_speaker_encoder_config.json")
 output_path = os.path.join(get_tests_output_path(), "train_outputs")

 config = SpeakerEncoderConfig(
@@ -24,16 +37,9 @@ config.audio.do_trim_silence = True
 config.audio.trim_db = 60
 config.save_json(config_path)

+print(config)
 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} "
-    f"--coqpit.output_path {output_path} "
-    "--coqpit.datasets.0.name ljspeech "
-    "--coqpit.datasets.0.meta_file_train metadata.csv "
-    "--coqpit.datasets.0.meta_file_val metadata.csv "
-    "--coqpit.datasets.0.path tests/data/ljspeech "
-)
-run_cli(command_train)
+run_test_train()

 # Find latest folder
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
@@ -50,15 +56,7 @@ config.model_params["model_name"] = "resnet"
 config.save_json(config_path)

 # train the model for one epoch
-command_train = (
-    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} "
-    f"--coqpit.output_path {output_path} "
-    "--coqpit.datasets.0.name ljspeech "
-    "--coqpit.datasets.0.meta_file_train metadata.csv "
-    "--coqpit.datasets.0.meta_file_val metadata.csv "
-    "--coqpit.datasets.0.path tests/data/ljspeech "
-)
-run_cli(command_train)
+run_test_train()

 # Find latest folder
 continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
@@ -69,3 +67,18 @@ command_train = (
 )
 run_cli(command_train)
 shutil.rmtree(continue_path)
+
+# test model with ge2e loss function
+config.loss = "ge2e"
+config.save_json(config_path)
+run_test_train()
+
+# test model with angleproto loss function
+config.loss = "angleproto"
+config.save_json(config_path)
+run_test_train()
+
+# test model with softmaxproto loss function
+config.loss = "softmaxproto"
+config.save_json(config_path)
+run_test_train()