From 427c9e11001e570baa39217713e6c832affd511a Mon Sep 17 00:00:00 2001 From: jmaty Date: Tue, 29 Mar 2022 15:32:22 +0200 Subject: [PATCH 01/32] adding no_cleaners (to avoid lowercasing input) and synhesize_file.py (to synthesize text from a text file) --- TTS/bin/synthesize_file.py | 309 +++++++++++++++++++++++++++++++++ TTS/tts/utils/text/cleaners.py | 5 + 2 files changed, 314 insertions(+) create mode 100755 TTS/bin/synthesize_file.py diff --git a/TTS/bin/synthesize_file.py b/TTS/bin/synthesize_file.py new file mode 100755 index 00000000..ee910d87 --- /dev/null +++ b/TTS/bin/synthesize_file.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import argparse +from distutils.command.config import config +import sys +from argparse import RawTextHelpFormatter + +# pylint: disable=redefined-outer-name, unused-argument +from pathlib import Path, PurePath + +sys.path.insert(0, "/storage/plzen4-ntis/home/jmatouse/GIT_repos/Coqui-TTS.mod-0.6.1") + +from TTS.utils.manage import ModelManager +from TTS.utils.synthesizer import Synthesizer + + +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + if v.lower() in ("no", "false", "f", "n", "0"): + return False + raise argparse.ArgumentTypeError("Boolean value expected.") + + +def main(): + # pylint: disable=bad-option-value + parser = argparse.ArgumentParser( + description="""Synthesize speech on command line.\n\n""" + """You can either use your trained model or choose a model from the provided list.\n\n""" + """If you don't specify any models, then it uses LJSpeech based English model.\n\n""" + """ + # Example Runs: + + ## Single Speaker Models + + - list provided models + + ``` + $ ./TTS/bin/synthesize.py --list_models + ``` + + - run tts with default models. + + ``` + $ ./TTS/bin synthesize.py --text "Text for TTS" + ``` + + - run a tts model with its default vocoder model. + + ``` + $ ./TTS/bin synthesize.py --text "Text for TTS" --model_name "// + ``` + + - run with specific tts and vocoder models from the list + + ``` + $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "//" --vocoder_name "//" --output_path + ``` + + - run your own TTS model (Using Griffin-Lim Vocoder) + + ``` + $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav + ``` + + - run your own TTS and Vocoder models + ``` + $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav + --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json + ``` + + ## MULTI-SPEAKER MODELS + + - list the available speakers and choose as among them. + + ``` + $ ./TTS/bin/synthesize.py --model_name "//" --list_speaker_idxs + ``` + + - run the multi-speaker TTS model with the target speaker ID. + + ``` + $ ./TTS/bin/synthesize.py --text "Text for TTS." --out_path output/path/speech.wav --model_name "//" --speaker_idx + ``` + + - run your own multi-speaker TTS model. 
+ + ``` + $ ./TTS/bin/synthesize.py --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx + ``` + """, + formatter_class=RawTextHelpFormatter, + ) + + parser.add_argument( + "--list_models", + type=str2bool, + nargs="?", + const=True, + default=False, + help="list available pre-trained tts and vocoder models.", + ) + parser.add_argument("--text_file", type=str, default=None, help="Text file to generate speech from.") + + # Args for running pre-trained TTS models. + parser.add_argument( + "--model_name", + type=str, + default="tts_models/en/ljspeech/tacotron2-DDC", + help="Name of one of the pre-trained tts models in format //", + ) + parser.add_argument( + "--vocoder_name", + type=str, + default=None, + help="Name of one of the pre-trained vocoder models in format //", + ) + + # Args for running custom models + parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.") + parser.add_argument( + "--model_path", + type=str, + default=None, + help="Path to model file.", + ) + parser.add_argument( + "--out_dir", + type=str, + default="", + help="Output wav file path directory.", + ) + parser.add_argument( + "--out_name", + type=str, + default="utt", + help="Output wav filename.", + ) + parser.add_argument( + "-1", "--use_infile_label", + action='store_true', + help="Use in-file label (1st word) as output file name", + default=False + ) + parser.add_argument( + "--rm_last_word", + action='store_true', + help="Remove last word (typically corresponding to a pause)", + default=False + ) + parser.add_argument("--use_cuda", action='store_true', help="Run model on CUDA.", default=False) + parser.add_argument( + "--vocoder_path", + type=str, + help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).", + default=None, + ) + parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None) + parser.add_argument( + "--encoder_path", + type=str, + help="Path to speaker encoder model file.", + default=None, + ) + parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None) + + # args for multi-speaker synthesis + parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) + parser.add_argument( + "--speaker_idx", + type=str, + help="Target speaker ID for a multi-speaker TTS model.", + default=None, + ) + parser.add_argument( + "--speaker_wav", + nargs="+", + help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. 
The d_vectors is computed as their average.", + default=None, + ) + parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None) + parser.add_argument( + "--list_speaker_idxs", + help="List available speaker ids for the defined multi-speaker model.", + type=str2bool, + nargs="?", + const=True, + default=False, + ) + # aux args + parser.add_argument( + "--save_spectogram", + action='store_true', + help="If true save raw spectogram for further (vocoder) processing in out_path.", + default=False, + ) + + args = parser.parse_args() + + # print the description if either text or list_models is not set + if args.text_file is None and not args.list_models and not args.list_speaker_idxs: + parser.parse_args(["-h"]) + + # load model manager + path = Path(__file__).parent / "../.models.json" + manager = ModelManager(path) + + model_path = None + config_path = None + speakers_file_path = None + vocoder_path = None + vocoder_config_path = None + encoder_path = None + encoder_config_path = None + + # CASE1: list pre-trained TTS models + if args.list_models: + manager.list_models() + sys.exit() + + # CASE2: load pre-trained model paths + if args.model_name is not None and not args.model_path: + model_path, config_path, model_item = manager.download_model(args.model_name) + args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name + + if args.vocoder_name is not None and not args.vocoder_path: + vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) + + # CASE3: set custom model paths + # JMa: if config is not given => use config from the corresponding model/vocoder/encoder path + if args.model_path is not None: + model_path = args.model_path + config_path = args.config_path if args.config_path else PurePath(Path(model_path).parent, "config.json") + speakers_file_path = args.speakers_file_path + + if args.vocoder_path is not None: + vocoder_path = args.vocoder_path + vocoder_config_path = args.vocoder_config_path if args.vocoder_config_path else PurePath(Path(vocoder_path).parent, "config.json") + + if args.encoder_path is not None: + encoder_path = args.encoder_path + encoder_config_path = args.encoder_config_path if args.encoder_config_path else PurePath(Path(encoder_path).parent, "config.json") + + # load models + synthesizer = Synthesizer( + model_path, + config_path, + speakers_file_path, + vocoder_path, + vocoder_config_path, + encoder_path, + encoder_config_path, + args.use_cuda, + ) + + # query speaker ids of a multi-speaker model. + if args.list_speaker_idxs: + print( + " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." + ) + print(synthesizer.tts_model.speaker_manager.speaker_ids) + return + + # check the arguments against a multi-speaker model. + if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav): + print( + " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to " + "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`." + ) + return + + # Read lines (=sentences) from the input text file + with open(args.text_file, 'rt') as fr: + lines = fr.read().splitlines() + + # RUN THE SYNTHESIS line-by-line + for ix, line in enumerate(lines): + # Extract words + words = line.split() + + # Use first word as utterance name? 
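+        # Note (descriptive comment, not part of the original patch): if --use_infile_label is set,
+        # the first token of the input line is taken as the utterance (output file) name and is
+        # excluded from the synthesized text; otherwise the name is built from --out_name plus the line index.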
+ if args.use_infile_label: + uname = words[0] + sent_beg = 1 + else: + uname = args.out_name+str(ix) + sent_beg = 0 + # Prepare output path + out_path = PurePath(args.out_dir, "{}.wav".format(uname)) + + # Remove last word? + sent_end = -1 if args.rm_last_word else len(words) + + # Prepare text to synthesize + text = " ".join(words[sent_beg:sent_end]) + print(" > Text #{:02d}: {} --> {}".format(ix, text, out_path)) + + # kick it + wav = synthesizer.tts(text, args.speaker_idx, args.speaker_wav, args.gst_style) + + # save the results + # print(" > Saving output to {}".format(out_path)) + synthesizer.save_wav(wav, out_path) + +if __name__ == "__main__": + main() diff --git a/TTS/tts/utils/text/cleaners.py b/TTS/tts/utils/text/cleaners.py index f02f8fb4..3eff6440 100644 --- a/TTS/tts/utils/text/cleaners.py +++ b/TTS/tts/utils/text/cleaners.py @@ -143,3 +143,8 @@ def multilingual_cleaners(text): text = remove_aux_symbols(text) text = collapse_whitespace(text) return text + +def no_cleaners(text): + """JMa: Basic pipeline that only collapses whitespace. No lowercase is done!""" + text = collapse_whitespace(text) + return text From aae77dac0748b095c701b291d8c830701c023ed5 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Wed, 6 Apr 2022 10:06:06 +0200 Subject: [PATCH 02/32] WA: when [!] is not at the end of a sentence, it is used as a glottal stop in the phonetic input and sentences are NOT delimited by [!] --- TTS/utils/synthesizer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index d1abc907..392531a6 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -155,7 +155,13 @@ class Synthesizer(object): Returns: List[str]: list of sentences. """ - return self.seg.segment(text) + # JMa + # WA: fix glottal stop (!): "ahoj, !", "ahoj." => "ahoj, !ahoj." + # Exclamation mark (!) at the end of the sentence should not be affected. + # return self.seg.segment(text) + sents = self.seg.segment(text) + split_text = " ".join(sents) + return [split_text.replace("! ", "!")] def save_wav(self, wav: List[int], path: str) -> None: """Save the waveform as a file. From 51d7ad161c47748db417c2d970fbf97e5dab21e9 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Wed, 6 Apr 2022 14:37:51 +0200 Subject: [PATCH 03/32] Better WA for glottal stop: now works also for multiple sentences in a single input text --- TTS/utils/synthesizer.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 392531a6..4007931b 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -156,12 +156,20 @@ class Synthesizer(object): List[str]: list of sentences. """ # JMa - # WA: fix glottal stop (!): "ahoj, !", "ahoj." => "ahoj, !ahoj." - # Exclamation mark (!) at the end of the sentence should not be affected. - # return self.seg.segment(text) - sents = self.seg.segment(text) - split_text = " ".join(sents) - return [split_text.replace("! ", "!")] + if "!" in self.tts_config.characters.characters: + # Our proprietary phonetic mode enabled: the input text is assumed + # to be a sequence of phones plus punctuations (without "!") and pauses (#, $). + # (!) is a regular character, not a punctuation + # WA: Glottal stop [!] is temporarily replaced with [*] to prevent + # boundary detection. + # + # Example: "!ahoj, !adame." -> ["!ahoj, !", "adame."] + # Fix: "!ahoj, !adame." 
-> ["!ahoj, !adame."] + text = text.replace("!", "*") + sents = self.seg.segment(text) + return [s.replace("*", "!") for s in sents] + else: # Original code + return self.seg.segment(text) def save_wav(self, wav: List[int], path: str) -> None: """Save the waveform as a file. From 458512d236c60aa91622bb93dd365b0425eab063 Mon Sep 17 00:00:00 2001 From: jmaty Date: Thu, 5 May 2022 09:24:37 +0200 Subject: [PATCH 04/32] better printing --- TTS/bin/synthesize_file.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/bin/synthesize_file.py b/TTS/bin/synthesize_file.py index ee910d87..704f33d3 100755 --- a/TTS/bin/synthesize_file.py +++ b/TTS/bin/synthesize_file.py @@ -286,7 +286,7 @@ def main(): uname = words[0] sent_beg = 1 else: - uname = args.out_name+str(ix) + uname = "{}{:03d}".format(args.out_name, ix) sent_beg = 0 # Prepare output path out_path = PurePath(args.out_dir, "{}.wav".format(uname)) @@ -296,7 +296,7 @@ def main(): # Prepare text to synthesize text = " ".join(words[sent_beg:sent_end]) - print(" > Text #{:02d}: {} --> {}".format(ix, text, out_path)) + print(" > Text #{:03d}: {} --> {}".format(ix, text, out_path)) # kick it wav = synthesizer.tts(text, args.speaker_idx, args.speaker_wav, args.gst_style) From c273295333545a0a678707501b6887b3c03be4e0 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek <> Date: Wed, 18 May 2022 14:54:35 +0200 Subject: [PATCH 05/32] Add the option to concatenate audio to a sinfle output wav file --- TTS/bin/synthesize_file.py | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/TTS/bin/synthesize_file.py b/TTS/bin/synthesize_file.py index 704f33d3..aeb97a6d 100755 --- a/TTS/bin/synthesize_file.py +++ b/TTS/bin/synthesize_file.py @@ -139,6 +139,18 @@ def main(): default="utt", help="Output wav filename.", ) + parser.add_argument( + "--out_path", + type=str, + default="", + help="Output wav file path.", + ) + parser.add_argument( + "--concat_audio", + action='store_true', + help="Concatenate audio to a single output file", + default=False + ) parser.add_argument( "-1", "--use_infile_label", action='store_true', @@ -276,6 +288,9 @@ def main(): with open(args.text_file, 'rt') as fr: lines = fr.read().splitlines() + # Resulting wav + tot_wav = [] + # RUN THE SYNTHESIS line-by-line for ix, line in enumerate(lines): # Extract words @@ -288,22 +303,31 @@ def main(): else: uname = "{}{:03d}".format(args.out_name, ix) sent_beg = 0 - # Prepare output path - out_path = PurePath(args.out_dir, "{}.wav".format(uname)) # Remove last word? 
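            # Note (descriptive comment, not part of the original patch): with --rm_last_word the trailing
            # token of the line is dropped; per the argument help it typically corresponds to a pause symbol.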
sent_end = -1 if args.rm_last_word else len(words) # Prepare text to synthesize text = " ".join(words[sent_beg:sent_end]) - print(" > Text #{:03d}: {} --> {}".format(ix, text, out_path)) # kick it wav = synthesizer.tts(text, args.speaker_idx, args.speaker_wav, args.gst_style) - # save the results - # print(" > Saving output to {}".format(out_path)) - synthesizer.save_wav(wav, out_path) + # Concatenate resulting wav + if args.concat_audio: + print(" > Text #{:03d}: {}".format(ix, text)) + tot_wav.append(wav) + else: + # Save the wav for each line + # print(" > Saving output to {}".format(out_path)) + # Prepare output path + out_path = PurePath(args.out_dir, "{}.wav".format(uname)) + print(" > Text #{:03d}: {} --> {}".format(ix, text, out_path)) + synthesizer.save_wav(wav, out_path) + + if args.concat_audio: + print(" > Saving audio to {}".format(args.out_path)) + synthesizer.save_wav(tot_wav, args.out_path) if __name__ == "__main__": main() From 2b4dd71d5d6c01c362149fd77ae9d9642561b3f0 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Sun, 22 May 2022 22:19:25 +0200 Subject: [PATCH 06/32] Fix concatenate audio files --- TTS/bin/synthesize_file.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/TTS/bin/synthesize_file.py b/TTS/bin/synthesize_file.py index aeb97a6d..cae5092a 100755 --- a/TTS/bin/synthesize_file.py +++ b/TTS/bin/synthesize_file.py @@ -326,8 +326,12 @@ def main(): synthesizer.save_wav(wav, out_path) if args.concat_audio: + # Concatenate resulting wav print(" > Saving audio to {}".format(args.out_path)) - synthesizer.save_wav(tot_wav, args.out_path) + single_wav = [] + for wav in tot_wav: + single_wav.extend(list(wav)) + synthesizer.save_wav(single_wav, args.out_path) if __name__ == "__main__": main() From 318e32decd747b6268728f03c17bf2245fe70423 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek <> Date: Mon, 23 May 2022 22:16:54 +0200 Subject: [PATCH 07/32] Ignore .svn --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 2a3cbad4..bae673f3 100644 --- a/.gitignore +++ b/.gitignore @@ -166,4 +166,7 @@ internal/* *_phoneme.npy wandb depot/* -coqui_recipes/* \ No newline at end of file +coqui_recipes/* + +# SVN +.svn/ From 1104c47524dc131b18582fedc5130be4f1ab869c Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Wed, 25 May 2022 15:46:43 +0200 Subject: [PATCH 08/32] Change rights to executable --- TTS/bin/collect_env_info.py | 0 TTS/bin/compute_attention_masks.py | 0 TTS/bin/compute_embeddings.py | 0 TTS/bin/eval_encoder.py | 0 TTS/bin/find_unique_chars.py | 0 TTS/bin/find_unique_phonemes.py | 0 TTS/bin/resample.py | 0 TTS/bin/train_encoder.py | 0 TTS/bin/train_tts.py | 0 TTS/bin/train_vocoder.py | 0 TTS/bin/tune_wavegrad.py | 0 11 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 TTS/bin/collect_env_info.py mode change 100644 => 100755 TTS/bin/compute_attention_masks.py mode change 100644 => 100755 TTS/bin/compute_embeddings.py mode change 100644 => 100755 TTS/bin/eval_encoder.py mode change 100644 => 100755 TTS/bin/find_unique_chars.py mode change 100644 => 100755 TTS/bin/find_unique_phonemes.py mode change 100644 => 100755 TTS/bin/resample.py mode change 100644 => 100755 TTS/bin/train_encoder.py mode change 100644 => 100755 TTS/bin/train_tts.py mode change 100644 => 100755 TTS/bin/train_vocoder.py mode change 100644 => 100755 TTS/bin/tune_wavegrad.py diff --git a/TTS/bin/collect_env_info.py b/TTS/bin/collect_env_info.py old mode 100644 new mode 100755 diff 
--git a/TTS/bin/compute_attention_masks.py b/TTS/bin/compute_attention_masks.py old mode 100644 new mode 100755 diff --git a/TTS/bin/compute_embeddings.py b/TTS/bin/compute_embeddings.py old mode 100644 new mode 100755 diff --git a/TTS/bin/eval_encoder.py b/TTS/bin/eval_encoder.py old mode 100644 new mode 100755 diff --git a/TTS/bin/find_unique_chars.py b/TTS/bin/find_unique_chars.py old mode 100644 new mode 100755 diff --git a/TTS/bin/find_unique_phonemes.py b/TTS/bin/find_unique_phonemes.py old mode 100644 new mode 100755 diff --git a/TTS/bin/resample.py b/TTS/bin/resample.py old mode 100644 new mode 100755 diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py old mode 100644 new mode 100755 diff --git a/TTS/bin/train_tts.py b/TTS/bin/train_tts.py old mode 100644 new mode 100755 diff --git a/TTS/bin/train_vocoder.py b/TTS/bin/train_vocoder.py old mode 100644 new mode 100755 diff --git a/TTS/bin/tune_wavegrad.py b/TTS/bin/tune_wavegrad.py old mode 100644 new mode 100755 From 829e2c24f973409dc8215b0436a62d14cf587107 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Tue, 21 Jun 2022 14:11:39 +0200 Subject: [PATCH 09/32] v0.7.1 (#1676) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add Thorsten VITS model (#1675) Co-authored-by: Eren Gölge * Remove GL message Co-authored-by: WeberJulian --- TTS/.models.json | 7 +++++++ TTS/utils/synthesizer.py | 2 -- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index 660d479c..93d9f417 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -208,6 +208,13 @@ "author": "@thorstenMueller", "license": "apache 2.0", "commit": "unknown" + }, + "vits": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.7.0_models/tts_models--de--thorsten--vits.zip", + "default_vocoder": null, + "author": "@thorstenMueller", + "license": "apache 2.0", + "commit": "unknown" } } }, diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 9ce528a3..2f319809 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -78,8 +78,6 @@ class Synthesizer(object): if vocoder_checkpoint: self._load_vocoder(vocoder_checkpoint, vocoder_config, use_cuda) self.output_sample_rate = self.vocoder_config.audio["sample_rate"] - else: - print(" > Using Griffin-Lim as no vocoder model defined") @staticmethod def _get_segmenter(lang: str): From d214ac1405f7574a18fe021d169e895828dc6d86 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Tue, 28 Jun 2022 15:22:04 +0200 Subject: [PATCH 10/32] fix outputs[0] coming as None proposed by manmay-nakhashi in https://github.com/coqui-ai/TTS/pull/1641 --- TTS/vocoder/models/gan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index ed5b26dd..1bca9bdf 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -198,7 +198,7 @@ class GAN(BaseVocoder): Returns: Tuple[Dict, Dict]: log figures and audio samples. 
""" - y_hat = outputs[0]["model_outputs"] + y_hat = outputs[0]["model_outputs"] if outputs[0] is not None else outputs[1]["model_outputs"] y = batch["waveform"] figures = plot_results(y_hat, y, ap, name) sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() From 9758971baaed77b94e5a7978d4deb8b956c8bf96 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Sun, 10 Jul 2022 11:27:02 +0200 Subject: [PATCH 11/32] Add artic formatter --- TTS/tts/datasets/formatters.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index ef05ea7c..ce36c819 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -556,3 +556,25 @@ def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument text = cols[2].replace(" ", "") items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) return items + + +def artic(root_path, meta_file, **kwargs): # pylint: disable=unused-argument + """Normalizes the ARTIC meta data file to TTS format""" + txt_file = os.path.join(root_path, meta_file) + items = [] + speaker_name = "artic" + with open(txt_file, "r", encoding="utf-8") as ttf: + for line in ttf: + # Split according to standard delimiter + cols = line.split("|") + if len(cols) > 1: + # One or two |s are present => text is taken from the last part + text = cols[-1] + else: + # Assume ARTIC SNT format => wav name is delimited by the first space + cols = line.split(maxsplit=1) + text = cols[1] + # in either way, wav name is stored in `cols[0]` + wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) + return items From 3270dda16246b23368e0876ae9b65faf44dd6313 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Sun, 10 Jul 2022 11:37:40 +0200 Subject: [PATCH 12/32] Refactor artic formatter --- TTS/tts/datasets/formatters.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index ce36c819..a3c40522 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -565,16 +565,16 @@ def artic(root_path, meta_file, **kwargs): # pylint: disable=unused-argument speaker_name = "artic" with open(txt_file, "r", encoding="utf-8") as ttf: for line in ttf: - # Split according to standard delimiter - cols = line.split("|") - if len(cols) > 1: - # One or two |s are present => text is taken from the last part - text = cols[-1] + # Check the number of standard separators + n_seps = line.count("|") + if n_seps > 0: + # Split according to standard separator + cols = line.split("|") else: # Assume ARTIC SNT format => wav name is delimited by the first space cols = line.split(maxsplit=1) - text = cols[1] - # in either way, wav name is stored in `cols[0]` + # In either way, wav name is stored in `cols[0]` and text in `cols[-1]` wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") + text = cols[-1] items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) return items From 8e758ca8fec43404fb684d98771d14d18025ba9f Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Sun, 10 Jul 2022 15:24:17 +0200 Subject: [PATCH 13/32] Set speaker name to the directory name containing speaker's data --- TTS/tts/datasets/formatters.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py 
index a3c40522..4e120bc6 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -562,7 +562,8 @@ def artic(root_path, meta_file, **kwargs): # pylint: disable=unused-argument """Normalizes the ARTIC meta data file to TTS format""" txt_file = os.path.join(root_path, meta_file) items = [] - speaker_name = "artic" + # Speaker name is the name of the directory with the data (last part of `root_path`) + speaker_name = os.path.basename(os.path.normpath(root_path)) with open(txt_file, "r", encoding="utf-8") as ttf: for line in ttf: # Check the number of standard separators From 1896db7e2c539a1f02c6e389726530d145f89fc1 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Sun, 10 Jul 2022 22:08:11 +0200 Subject: [PATCH 14/32] Add formatter for artic multispeaker dataset --- TTS/tts/datasets/formatters.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 4e120bc6..eadf0529 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -579,3 +579,21 @@ def artic(root_path, meta_file, **kwargs): # pylint: disable=unused-argument text = cols[-1] items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) return items + + +def artic_multispeaker(root_path, meta_file, **kwargs): # pylint: disable=unused-argument + """Normalizes the ARTIC multi-speaker meta data files to TTS format + + Args: + root_path (str): path to the artic dataset + meta_file (str): name of the meta file containing names of wav to select and + transcripts of the corresponding utterances + !Must be the same for all speakers! + Returns: + List[List[str]]: List of (text, wav_path, speaker_name) associated with each utterance + """ + items = [] + # Loop over speakers: speaker names are subdirs of `root_path` + for pth in glob(f"{root_path}/*/**/", recursive=False): + items.extend(artic(pth, meta_file)) + return items From a7d2e9b47576aca6b7621ded3c0195ec20c77505 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Sun, 10 Jul 2022 22:31:41 +0200 Subject: [PATCH 15/32] Support ignored speakers in artic multi-speaker formatter --- TTS/tts/datasets/formatters.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index eadf0529..a9216166 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -581,19 +581,26 @@ def artic(root_path, meta_file, **kwargs): # pylint: disable=unused-argument return items -def artic_multispeaker(root_path, meta_file, **kwargs): # pylint: disable=unused-argument +def artic_multispeaker(root_path, meta_file, ignored_speakers=None): # pylint: disable=unused-argument """Normalizes the ARTIC multi-speaker meta data files to TTS format - + Args: root_path (str): path to the artic dataset meta_file (str): name of the meta file containing names of wav to select and transcripts of the corresponding utterances !Must be the same for all speakers! 
+ ignore_speakers (List[str]): list of ignored speakers (or None) + Returns: List[List[str]]: List of (text, wav_path, speaker_name) associated with each utterance """ items = [] # Loop over speakers: speaker names are subdirs of `root_path` for pth in glob(f"{root_path}/*/**/", recursive=False): + speaker_name = os.path.basename(pth) + # Ignore speakers + if isinstance(ignored_speakers, list): + if speaker_name in ignored_speakers: + continue items.extend(artic(pth, meta_file)) return items From 61508bf33654e65851eeeea670424480ae56d596 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Wed, 20 Jul 2022 21:12:16 +0200 Subject: [PATCH 16/32] Fix artic_multispeaker formatter --- TTS/tts/datasets/formatters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index a9216166..d4408c79 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -596,7 +596,7 @@ def artic_multispeaker(root_path, meta_file, ignored_speakers=None): # pylint: d """ items = [] # Loop over speakers: speaker names are subdirs of `root_path` - for pth in glob(f"{root_path}/*/**/", recursive=False): + for pth in glob(f"{root_path}/*", recursive=False): speaker_name = os.path.basename(pth) # Ignore speakers if isinstance(ignored_speakers, list): From af2aee5ba97334d0f18f4a52bea8a1230348b9c5 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Tue, 9 Aug 2022 11:00:06 +0200 Subject: [PATCH 17/32] Fix train_log name --- TTS/vocoder/models/gan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index 1bca9bdf..e7dd3609 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -209,7 +209,7 @@ class GAN(BaseVocoder): self, batch: Dict, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument ) -> Tuple[Dict, np.ndarray]: """Call `_log()` for training.""" - figures, audios = self._log("eval", self.ap, batch, outputs) + figures, audios = self._log("train", self.ap, batch, outputs) logger.eval_figures(steps, figures) logger.eval_audios(steps, audios, self.ap.sample_rate) From 946afa8197f2624fdeca148f6272560d72216488 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 22 Aug 2022 14:54:38 +0200 Subject: [PATCH 18/32] v0.8.0 (#1810) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix checkpointing GAN models (#1641) * checkpoint sae step crash fix * checkpoint save step crash fix * Update gan.py updated requested changes * crash fix * Fix the --model_name and --vocoder_name arguments need a element (#1469) Co-authored-by: Eren Gölge * Fix Publish CI (#1597) * Try out manylinux * temporary removal of useless pipeline * remove check and use only manylinux * Try --plat-name * Add install requirements * Add back other actions * Add PR trigger * Remove conditions * Fix sythax * Roll back some changes * Add other python versions * Add test pypi upload * Add username * Add back __token__ as username * Modify name of entry to testpypi * Set it to release only * Fix version checking * Fix tokenizer for punc only (#1717) * Remove redundant config field * Fix SSIM loss * Separate loss tests * Fix BCELoss adressing #1192 * Make style * Add durations as aux input for VITS (#1694) * Add durations as aux input for VITS * Make style * Fix tts_tests * Fix test_get_aux_input * Make lint * feat: updated recipes and lr fix (#1718) - updated the recipes 
activating more losses for more stable training - re-enabling guided attention loss - fixed a bug about not the correct lr fetched for logging * Implement VitsAudioConfig (#1556) * Implement VitsAudioConfig * Update VITS LJSpeech recipe * Update VITS VCTK recipe * Make style * Add missing decorator * Add missing param * Make style * Update recipes * Fix test * Bug fix * Exclude tests folder * Make linter * Make style * Fix device allocation * Fix SSIM loss correction * Fix aux tests (#1753) * Set n_jobs to 1 for resample script * Delete resample test * Set n_jobs 1 in vad test * delete vad test * Revert "Delete resample test" This reverts commit bb7c8466af0832e8314ec0531290d939b2bb6565. * Remove tests with resample * Fix for FloorDiv Function Warning (#1760) * Fix for Floor Function Warning Fix for Floor Function Warning * Adding double quotes to fix formatting Adding double quotes to fix formatting * Update glow_tts.py * Update glow_tts.py * Fix type in download_vctk.sh (#1739) typo in comment * Update decoder.py (#1792) Minor comment correction. * Update requirements.txt (#1791) Support for #1775 * Update README.md (#1776) Fix typo in different and code sample * Fix & update WaveRNN vocoder model (#1749) * Fixes KeyError bug. Adding logging to dashboard. * Make pep8 compliant * Make style compliant * Still fixing style * Fix rand_segment edge case (input_len == seg_len - 1) * Update requirements.txt; inflect==5.6 (#1809) New inflect version (6.0) depends on pydantic which has some issues irrelevant to 🐸 TTS. #1808 Force inflect==5.6 (pydantic free) install to solve dependency issue. * Update README.md; download progress bar in CLI. (#1797) * Update README.md - minor PR - added model_info usage guide based on #1623 in README.md . * "added tqdm bar for model download" * Update manage.py * fixed style * fixed style * sort imports * Update wavenet.py (#1796) * Update wavenet.py Current version does not use "in_channels" argument. In glowTTS, we use normalizing flows and so "input dim" == "ouput dim" (channels and length). So, the existing code just uses hidden_channel sized tensor as input to first layer as well as outputs hidden_channel sized tensor. However, since it is a generic implementation, I believe it is better to update it for a more general use. * "in_channels -> hidden_channels" * Adjust default to be able to process longer sentences (#1835) Running `tts --text "$text" --out_path …` with a somewhat longer sentences in the text will lead to warnings like “Decoder stopped with max_decoder_steps 500” and the sentences just being cut off in the resulting WAV file. This happens quite frequently when feeding longer texts (e.g. a blog post) to `tts`. It's particular frustrating since the error is not always obvious in the output. You have to notice that there are missing parts. This is something other users seem to have run into as well [1]. This patch simply increases the maximum number of steps allowed for the tacotron decoder to fix this issue, resulting in a smoother default behavior. 
[1] https://github.com/mozilla/TTS/issues/734 * Fix language flags generated by espeak-ng phonemizer (#1801) * fix language flags generated by espeak-ng phonemizer * Style * Updated language flag regex to consider all language codes alike * fix get_random_embeddings --> get_random_embedding (#1726) * fix get_random_embeddings --> get_random_embedding function typo leads to training crash, no such function * fix typo get_random_embedding * Introduce numpy and torch transforms (#1705) * Refactor audio processing functions * Add tests for numpy transforms * Fix imports * Fix imports2 * Implement bucketed weighted sampling for VITS (#1871) * Update capacitron_layers.py (#1664) crashing because of dimension miss match at line no. 57 [batch, 256] vs [batch , 1, 512] enc_out = torch.cat([enc_out, speaker_embedding], dim=-1) * updates to dataset analysis notebooks for compatibility with latest version of TTS (#1853) * Fix BCE loss issue (#1872) * Fix BCE loss issue * Remove import * Remove deprecated files (#1873) - samplers.py is moved - distribute.py is replaces by the 👟Trainer * Handle when no batch sampler (#1882) * Fix tune wavegrad (#1844) * fix imports in tune_wavegrad * load_config returns Coqpit object instead None * set action (store true) for flag "--use_cuda"; start to tune if module is running as the main program * fix var order in the result of batch collating * make style * make style with black and isort * Bump up to v0.8.0 * Add new DE Thorsten models (#1898) - Tacotron2-DDC - HifiGAN vocoder Co-authored-by: manmay nakhashi Co-authored-by: camillem Co-authored-by: WeberJulian Co-authored-by: a-froghyar Co-authored-by: ivan provalov Co-authored-by: Tsai Meng-Ting Co-authored-by: p0p4k Co-authored-by: Yuri Pourre Co-authored-by: vanIvan Co-authored-by: Lars Kiesow Co-authored-by: rbaraglia Co-authored-by: jchai.me Co-authored-by: Stanislav Kachnov <42406556+geth-network@users.noreply.github.com> --- .github/workflows/pypi-release.yml | 14 +- MANIFEST.in | 3 +- README.md | 46 +- TTS/.models.json | 15 + TTS/VERSION | 2 +- TTS/bin/synthesize.py | 4 +- TTS/bin/train_encoder.py | 2 +- TTS/bin/tune_wavegrad.py | 163 +++---- TTS/config/__init__.py | 2 +- TTS/encoder/models/resnet.py | 2 +- TTS/tts/configs/shared_configs.py | 4 - TTS/tts/configs/tacotron_config.py | 6 +- TTS/tts/configs/vits_config.py | 23 +- TTS/tts/datasets/formatters.py | 64 ++- TTS/tts/layers/generic/wavenet.py | 11 +- TTS/tts/layers/glow_tts/decoder.py | 4 +- TTS/tts/layers/glow_tts/glow.py | 2 +- TTS/tts/layers/losses.py | 71 ++- TTS/tts/layers/tacotron/capacitron_layers.py | 1 + TTS/tts/models/base_tts.py | 2 +- TTS/tts/models/glow_tts.py | 2 +- TTS/tts/models/vits.py | 161 +++++-- TTS/tts/utils/helpers.py | 10 +- TTS/tts/utils/ssim.py | 422 ++++++++++++++--- .../utils/text/phonemizers/espeak_wrapper.py | 8 + TTS/tts/utils/text/punctuation.py | 2 +- TTS/utils/audio/__init__.py | 1 + TTS/utils/audio/numpy_transforms.py | 425 ++++++++++++++++++ TTS/utils/{audio.py => audio/processor.py} | 171 +------ TTS/utils/audio/torch_transforms.py | 163 +++++++ TTS/utils/capacitron_optimizer.py | 2 + TTS/utils/manage.py | 15 +- TTS/{encoder => }/utils/samplers.py | 90 +++- TTS/utils/synthesizer.py | 2 +- TTS/vocoder/datasets/wavegrad_dataset.py | 2 +- TTS/vocoder/layers/losses.py | 2 +- TTS/vocoder/models/gan.py | 5 +- TTS/vocoder/models/univnet_discriminator.py | 2 +- TTS/vocoder/models/wavernn.py | 13 +- .../dataset_analysis/AnalyzeDataset.ipynb | 21 +- .../dataset_analysis/PhonemeCoverage.ipynb | 25 +- 
.../train_capacitron_t1.py | 12 +- .../train_capacitron_t2.py | 14 +- .../ljspeech/fast_pitch/train_fast_pitch.py | 1 - .../ljspeech/fast_speech/train_fast_speech.py | 1 - .../speedy_speech/train_speedy_speech.py | 1 - .../train_capacitron_t2.py | 1 - recipes/ljspeech/vits_tts/train_vits.py | 23 +- .../multilingual/vits_tts/train_vits_tts.py | 15 +- .../speedy_speech/train_speedy_speech.py | 1 - recipes/thorsten_DE/vits_tts/train_vits.py | 13 +- recipes/vctk/download_vctk.sh | 2 +- recipes/vctk/vits/train_vits.py | 22 +- requirements.txt | 8 +- run_bash_tests.sh | 1 - setup.py | 2 +- tests/aux_tests/test_audio_processor.py | 2 +- tests/aux_tests/test_numpy_transforms.py | 105 +++++ .../test_remove_silence_vad_script.py | 29 -- tests/bash_tests/test_resample.sh | 16 - tests/data_tests/test_samplers.py | 30 +- tests/text_tests/test_tokenizer.py | 7 + tests/tts_tests/test_losses.py | 239 ++++++++++ tests/tts_tests/test_tacotron_layers.py | 130 ------ tests/tts_tests/test_vits.py | 28 +- 65 files changed, 1971 insertions(+), 717 deletions(-) create mode 100644 TTS/utils/audio/__init__.py create mode 100644 TTS/utils/audio/numpy_transforms.py rename TTS/utils/{audio.py => audio/processor.py} (84%) create mode 100644 TTS/utils/audio/torch_transforms.py rename TTS/{encoder => }/utils/samplers.py (55%) create mode 100644 tests/aux_tests/test_numpy_transforms.py delete mode 100644 tests/aux_tests/test_remove_silence_vad_script.py delete mode 100755 tests/bash_tests/test_resample.sh create mode 100644 tests/tts_tests/test_losses.py diff --git a/.github/workflows/pypi-release.yml b/.github/workflows/pypi-release.yml index 83797be1..fc990826 100644 --- a/.github/workflows/pypi-release.yml +++ b/.github/workflows/pypi-release.yml @@ -42,16 +42,18 @@ jobs: - uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - - run: | + - name: Install pip requirements + run: | python -m pip install -U pip setuptools wheel build - - run: | - python -m build - - run: | - python -m pip install dist/*.whl + python -m pip install -r requirements.txt + - name: Setup and install manylinux1_x86_64 wheel + run: | + python setup.py bdist_wheel --plat-name=manylinux1_x86_64 + python -m pip install dist/*-manylinux*.whl - uses: actions/upload-artifact@v2 with: name: wheel-${{ matrix.python-version }} - path: dist/*.whl + path: dist/*-manylinux*.whl publish-artifacts: runs-on: ubuntu-20.04 needs: [build-sdist, build-wheels] diff --git a/MANIFEST.in b/MANIFEST.in index 82ecadcb..321d3999 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -11,4 +11,5 @@ recursive-include TTS *.md recursive-include TTS *.py recursive-include TTS *.pyx recursive-include images *.png - +recursive-exclude tests * +prune tests* diff --git a/README.md b/README.md index 8ed67c30..1ca585d7 100644 --- a/README.md +++ b/README.md @@ -130,7 +130,7 @@ pip install -e .[all,dev,notebooks] # Select the relevant extras If you are on Ubuntu (Debian), you can also run following commands for installation. ```bash -$ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a diffent OS. +$ make system-deps # intended to be used on Ubuntu (Debian). Let us know if you have a different OS. $ make install ``` @@ -145,25 +145,61 @@ If you are on Windows, 👑@GuyPaddock wrote installation instructions [here](ht ``` $ tts --list_models ``` - +- Get model info (for both tts_models and vocoder_models): + - Query by type/name: + The model_info_by_name uses the name as it from the --list_models. 
+ ``` + $ tts --model_info_by_name "///" + ``` + For example: + + ``` + $ tts --model_info_by_name tts_models/tr/common-voice/glow-tts + ``` + ``` + $ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2 + ``` + - Query by type/idx: + The model_query_idx uses the corresponding idx from --list_models. + ``` + $ tts --model_info_by_idx "/" + ``` + For example: + + ``` + $ tts --model_info_by_idx tts_models/3 + ``` + - Run TTS with default models: ``` - $ tts --text "Text for TTS" + $ tts --text "Text for TTS" --out_path output/path/speech.wav ``` - Run a TTS model with its default vocoder model: ``` - $ tts --text "Text for TTS" --model_name "// + $ tts --text "Text for TTS" --model_name "///" --out_path output/path/speech.wav + ``` + For example: + + ``` + $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --out_path output/path/speech.wav ``` - Run with specific TTS and vocoder models from the list: ``` - $ tts --text "Text for TTS" --model_name "//" --vocoder_name "//" --output_path + $ tts --text "Text for TTS" --model_name "///" --vocoder_name "///" --out_path output/path/speech.wav ``` + For example: + + ``` + $ tts --text "Text for TTS" --model_name "tts_models/en/ljspeech/glow-tts" --vocoder_name "vocoder_models/en/ljspeech/univnet" --out_path output/path/speech.wav + ``` + + - Run your own TTS model (Using Griffin-Lim Vocoder): ``` diff --git a/TTS/.models.json b/TTS/.models.json index 93d9f417..e12e7a0b 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -215,6 +215,14 @@ "author": "@thorstenMueller", "license": "apache 2.0", "commit": "unknown" + }, + "tacotron2-DDC": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/tts_models--de--thorsten--tacotron2-DDC.zip", + "default_vocoder": "vocoder_models/de/thorsten/hifigan_v1", + "description": "Thorsten-Dec2021-22k-DDC", + "author": "@thorstenMueller", + "license": "apache 2.0", + "commit": "unknown" } } }, @@ -460,6 +468,13 @@ "author": "@thorstenMueller", "license": "apache 2.0", "commit": "unknown" + }, + "hifigan_v1": { + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip", + "description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model", + "author": "@thorstenMueller", + "license": "apache 2.0", + "commit": "unknown" } } }, diff --git a/TTS/VERSION b/TTS/VERSION index 7deb86fe..8adc70fd 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.7.1 \ No newline at end of file +0.8.0 \ No newline at end of file diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 7c609890..9a95651a 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -60,13 +60,13 @@ If you don't specify any models, then it uses LJSpeech based English model. 
- Run a TTS model with its default vocoder model: ``` - $ tts --text "Text for TTS" --model_name "//" + $ tts --text "Text for TTS" --model_name "/// ``` - Run with specific TTS and vocoder models from the list: ``` - $ tts --text "Text for TTS" --model_name "//" --vocoder_name "//" --output_path + $ tts --text "Text for TTS" --model_name "///" --vocoder_name "///" --output_path ``` - Run your own TTS model (Using Griffin-Lim Vocoder): diff --git a/TTS/bin/train_encoder.py b/TTS/bin/train_encoder.py index d28f188e..f2e7779c 100644 --- a/TTS/bin/train_encoder.py +++ b/TTS/bin/train_encoder.py @@ -13,13 +13,13 @@ from trainer.trainer_utils import get_optimizer from TTS.encoder.dataset import EncoderDataset from TTS.encoder.utils.generic_utils import save_best_model, save_checkpoint, setup_encoder_model -from TTS.encoder.utils.samplers import PerfectBatchSampler from TTS.encoder.utils.training import init_training from TTS.encoder.utils.visual import plot_embeddings from TTS.tts.datasets import load_tts_samples from TTS.utils.audio import AudioProcessor from TTS.utils.generic_utils import count_parameters, remove_experiment_folder from TTS.utils.io import copy_model_files +from TTS.utils.samplers import PerfectBatchSampler from TTS.utils.training import check_update torch.backends.cudnn.enabled = True diff --git a/TTS/bin/tune_wavegrad.py b/TTS/bin/tune_wavegrad.py index a31d6c45..09582cea 100644 --- a/TTS/bin/tune_wavegrad.py +++ b/TTS/bin/tune_wavegrad.py @@ -1,4 +1,4 @@ -"""Search a good noise schedule for WaveGrad for a given number of inferece iterations""" +"""Search a good noise schedule for WaveGrad for a given number of inference iterations""" import argparse from itertools import product as cartesian_product @@ -7,94 +7,97 @@ import torch from torch.utils.data import DataLoader from tqdm import tqdm +from TTS.config import load_config from TTS.utils.audio import AudioProcessor -from TTS.utils.io import load_config from TTS.vocoder.datasets.preprocess import load_wav_data from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset -from TTS.vocoder.utils.generic_utils import setup_generator +from TTS.vocoder.models import setup_model -parser = argparse.ArgumentParser() -parser.add_argument("--model_path", type=str, help="Path to model checkpoint.") -parser.add_argument("--config_path", type=str, help="Path to model config file.") -parser.add_argument("--data_path", type=str, help="Path to data directory.") -parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.") -parser.add_argument( - "--num_iter", type=int, help="Number of model inference iterations that you like to optimize noise schedule for." -) -parser.add_argument("--use_cuda", type=bool, help="enable/disable CUDA.") -parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.") -parser.add_argument( - "--search_depth", - type=int, - default=3, - help="Search granularity. 
Increasing this increases the run-time exponentially.", -) +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--model_path", type=str, help="Path to model checkpoint.") + parser.add_argument("--config_path", type=str, help="Path to model config file.") + parser.add_argument("--data_path", type=str, help="Path to data directory.") + parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.") + parser.add_argument( + "--num_iter", + type=int, + help="Number of model inference iterations that you like to optimize noise schedule for.", + ) + parser.add_argument("--use_cuda", action="store_true", help="enable CUDA.") + parser.add_argument("--num_samples", type=int, default=1, help="Number of datasamples used for inference.") + parser.add_argument( + "--search_depth", + type=int, + default=3, + help="Search granularity. Increasing this increases the run-time exponentially.", + ) -# load config -args = parser.parse_args() -config = load_config(args.config_path) + # load config + args = parser.parse_args() + config = load_config(args.config_path) -# setup audio processor -ap = AudioProcessor(**config.audio) + # setup audio processor + ap = AudioProcessor(**config.audio) -# load dataset -_, train_data = load_wav_data(args.data_path, 0) -train_data = train_data[: args.num_samples] -dataset = WaveGradDataset( - ap=ap, - items=train_data, - seq_len=-1, - hop_len=ap.hop_length, - pad_short=config.pad_short, - conv_pad=config.conv_pad, - is_training=True, - return_segments=False, - use_noise_augment=False, - use_cache=False, - verbose=True, -) -loader = DataLoader( - dataset, - batch_size=1, - shuffle=False, - collate_fn=dataset.collate_full_clips, - drop_last=False, - num_workers=config.num_loader_workers, - pin_memory=False, -) + # load dataset + _, train_data = load_wav_data(args.data_path, 0) + train_data = train_data[: args.num_samples] + dataset = WaveGradDataset( + ap=ap, + items=train_data, + seq_len=-1, + hop_len=ap.hop_length, + pad_short=config.pad_short, + conv_pad=config.conv_pad, + is_training=True, + return_segments=False, + use_noise_augment=False, + use_cache=False, + verbose=True, + ) + loader = DataLoader( + dataset, + batch_size=1, + shuffle=False, + collate_fn=dataset.collate_full_clips, + drop_last=False, + num_workers=config.num_loader_workers, + pin_memory=False, + ) -# setup the model -model = setup_generator(config) -if args.use_cuda: - model.cuda() + # setup the model + model = setup_model(config) + if args.use_cuda: + model.cuda() -# setup optimization parameters -base_values = sorted(10 * np.random.uniform(size=args.search_depth)) -print(base_values) -exponents = 10 ** np.linspace(-6, -1, num=args.num_iter) -best_error = float("inf") -best_schedule = None -total_search_iter = len(base_values) ** args.num_iter -for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter): - beta = exponents * base - model.compute_noise_level(beta) - for data in loader: - mel, audio = data - y_hat = model.inference(mel.cuda() if args.use_cuda else mel) + # setup optimization parameters + base_values = sorted(10 * np.random.uniform(size=args.search_depth)) + print(f" > base values: {base_values}") + exponents = 10 ** np.linspace(-6, -1, num=args.num_iter) + best_error = float("inf") + best_schedule = None # pylint: disable=C0103 + total_search_iter = len(base_values) ** args.num_iter + for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), 
total=total_search_iter): + beta = exponents * base + model.compute_noise_level(beta) + for data in loader: + mel, audio = data + y_hat = model.inference(mel.cuda() if args.use_cuda else mel) - if args.use_cuda: - y_hat = y_hat.cpu() - y_hat = y_hat.numpy() + if args.use_cuda: + y_hat = y_hat.cpu() + y_hat = y_hat.numpy() - mel_hat = [] - for i in range(y_hat.shape[0]): - m = ap.melspectrogram(y_hat[i, 0])[:, :-1] - mel_hat.append(torch.from_numpy(m)) + mel_hat = [] + for i in range(y_hat.shape[0]): + m = ap.melspectrogram(y_hat[i, 0])[:, :-1] + mel_hat.append(torch.from_numpy(m)) - mel_hat = torch.stack(mel_hat) - mse = torch.sum((mel - mel_hat) ** 2).mean() - if mse.item() < best_error: - best_error = mse.item() - best_schedule = {"beta": beta} - print(f" > Found a better schedule. - MSE: {mse.item()}") - np.save(args.output_path, best_schedule) + mel_hat = torch.stack(mel_hat) + mse = torch.sum((mel - mel_hat) ** 2).mean() + if mse.item() < best_error: + best_error = mse.item() + best_schedule = {"beta": beta} + print(f" > Found a better schedule. - MSE: {mse.item()}") + np.save(args.output_path, best_schedule) diff --git a/TTS/config/__init__.py b/TTS/config/__init__.py index 6b0778c5..067c32d9 100644 --- a/TTS/config/__init__.py +++ b/TTS/config/__init__.py @@ -62,7 +62,7 @@ def _process_model_name(config_dict: Dict) -> str: return model_name -def load_config(config_path: str) -> None: +def load_config(config_path: str) -> Coqpit: """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name to find the corresponding Config class. Then initialize the Config. diff --git a/TTS/encoder/models/resnet.py b/TTS/encoder/models/resnet.py index 84e9967f..e75ab6c4 100644 --- a/TTS/encoder/models/resnet.py +++ b/TTS/encoder/models/resnet.py @@ -1,7 +1,7 @@ import torch from torch import nn -# from TTS.utils.audio import TorchSTFT +# from TTS.utils.audio.torch_transforms import TorchSTFT from TTS.encoder.models.base_encoder import BaseEncoder diff --git a/TTS/tts/configs/shared_configs.py b/TTS/tts/configs/shared_configs.py index 4704687c..e1ea8be3 100644 --- a/TTS/tts/configs/shared_configs.py +++ b/TTS/tts/configs/shared_configs.py @@ -200,9 +200,6 @@ class BaseTTSConfig(BaseTrainingConfig): loss_masking (bool): enable / disable masking loss values against padded segments of samples in a batch. - sort_by_audio_len (bool): - If true, dataloder sorts the data by audio length else sorts by the input text length. Defaults to `False`. - min_text_len (int): Minimum length of input text to be used. All shorter samples will be ignored. Defaults to 0. @@ -303,7 +300,6 @@ class BaseTTSConfig(BaseTrainingConfig): batch_group_size: int = 0 loss_masking: bool = None # dataloading - sort_by_audio_len: bool = False min_audio_len: int = 1 max_audio_len: int = float("inf") min_text_len: int = 1 diff --git a/TTS/tts/configs/tacotron_config.py b/TTS/tts/configs/tacotron_config.py index e25609ff..350b5ea9 100644 --- a/TTS/tts/configs/tacotron_config.py +++ b/TTS/tts/configs/tacotron_config.py @@ -53,7 +53,7 @@ class TacotronConfig(BaseTTSConfig): enable /disable the Stopnet that predicts the end of the decoder sequence. Defaults to True. stopnet_pos_weight (float): Weight that is applied to over-weight positive instances in the Stopnet loss. Use larger values with - datasets with longer sentences. Defaults to 10. + datasets with longer sentences. Defaults to 0.2. max_decoder_steps (int): Max number of steps allowed for the decoder. Defaults to 50. 
encoder_in_features (int): @@ -161,8 +161,8 @@ class TacotronConfig(BaseTTSConfig): prenet_dropout_at_inference: bool = False stopnet: bool = True separate_stopnet: bool = True - stopnet_pos_weight: float = 10.0 - max_decoder_steps: int = 500 + stopnet_pos_weight: float = 0.2 + max_decoder_steps: int = 10000 encoder_in_features: int = 256 decoder_in_features: int = 256 decoder_output_dim: int = 80 diff --git a/TTS/tts/configs/vits_config.py b/TTS/tts/configs/vits_config.py index a8c7f91d..3469f701 100644 --- a/TTS/tts/configs/vits_config.py +++ b/TTS/tts/configs/vits_config.py @@ -2,7 +2,7 @@ from dataclasses import dataclass, field from typing import List from TTS.tts.configs.shared_configs import BaseTTSConfig -from TTS.tts.models.vits import VitsArgs +from TTS.tts.models.vits import VitsArgs, VitsAudioConfig @dataclass @@ -16,6 +16,9 @@ class VitsConfig(BaseTTSConfig): model_args (VitsArgs): Model architecture arguments. Defaults to `VitsArgs()`. + audio (VitsAudioConfig): + Audio processing configuration. Defaults to `VitsAudioConfig()`. + grad_clip (List): Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`. @@ -67,6 +70,18 @@ class VitsConfig(BaseTTSConfig): compute_linear_spec (bool): If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`. + use_weighted_sampler (bool): + If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`. + + weighted_sampler_attrs (dict): + Key retuned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities + by overweighting `root_path` by 2.0. Defaults to `{}`. + + weighted_sampler_multipliers (dict): + Weight each unique value of a key returned by the formatter for weighted sampling. + For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`. + It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`. + r (int): Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`. 
@@ -94,6 +109,7 @@ class VitsConfig(BaseTTSConfig): model: str = "vits" # model specific params model_args: VitsArgs = field(default_factory=VitsArgs) + audio: VitsAudioConfig = VitsAudioConfig() # optimizer grad_clip: List[float] = field(default_factory=lambda: [1000, 1000]) @@ -120,6 +136,11 @@ class VitsConfig(BaseTTSConfig): return_wav: bool = True compute_linear_spec: bool = True + # sampler params + use_weighted_sampler: bool = False # TODO: move it to the base config + weighted_sampler_attrs: dict = field(default_factory=lambda: {}) + weighted_sampler_multipliers: dict = field(default_factory=lambda: {}) + # overrides r: int = 1 # DO NOT CHANGE add_blank: bool = True diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index ef05ea7c..a4be2b33 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -34,6 +34,7 @@ def coqui(root_path, meta_file, ignored_speakers=None): "audio_file": audio_path, "speaker_name": speaker_name if speaker_name is not None else row.speaker_name, "emotion_name": emotion_name if emotion_name is not None else row.emotion_name, + "root_path": root_path, } ) if not_found_counter > 0: @@ -53,7 +54,7 @@ def tweb(root_path, meta_file, **kwargs): # pylint: disable=unused-argument cols = line.split("\t") wav_file = os.path.join(root_path, cols[0] + ".wav") text = cols[1] - items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items @@ -68,7 +69,7 @@ def mozilla(root_path, meta_file, **kwargs): # pylint: disable=unused-argument wav_file = cols[1].strip() text = cols[0].strip() wav_file = os.path.join(root_path, "wavs", wav_file) - items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items @@ -84,7 +85,7 @@ def mozilla_de(root_path, meta_file, **kwargs): # pylint: disable=unused-argume text = cols[1].strip() folder_name = f"BATCH_{wav_file.split('_')[0]}_FINAL" wav_file = os.path.join(root_path, folder_name, wav_file) - items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items @@ -130,7 +131,9 @@ def mailabs(root_path, meta_files=None, ignored_speakers=None): wav_file = os.path.join(root_path, folder.replace("metadata.csv", ""), "wavs", cols[0] + ".wav") if os.path.isfile(wav_file): text = cols[1].strip() - items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) + items.append( + {"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path} + ) else: # M-AI-Labs have some missing samples, so just print the warning print("> File %s does not exist!" 
% (wav_file)) @@ -148,7 +151,7 @@ def ljspeech(root_path, meta_file, **kwargs): # pylint: disable=unused-argument cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") text = cols[2] - items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items @@ -166,7 +169,9 @@ def ljspeech_test(root_path, meta_file, **kwargs): # pylint: disable=unused-arg cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") text = cols[2] - items.append({"text": text, "audio_file": wav_file, "speaker_name": f"ljspeech-{speaker_id}"}) + items.append( + {"text": text, "audio_file": wav_file, "speaker_name": f"ljspeech-{speaker_id}", "root_path": root_path} + ) return items @@ -181,7 +186,7 @@ def thorsten(root_path, meta_file, **kwargs): # pylint: disable=unused-argument cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") text = cols[1] - items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items @@ -198,7 +203,7 @@ def sam_accenture(root_path, meta_file, **kwargs): # pylint: disable=unused-arg if not os.path.exists(wav_file): print(f" [!] {wav_file} in metafile does not exist. Skipping...") continue - items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items @@ -213,7 +218,7 @@ def ruslan(root_path, meta_file, **kwargs): # pylint: disable=unused-argument cols = line.split("|") wav_file = os.path.join(root_path, "RUSLAN", cols[0] + ".wav") text = cols[1] - items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items @@ -261,7 +266,9 @@ def common_voice(root_path, meta_file, ignored_speakers=None): if speaker_name in ignored_speakers: continue wav_file = os.path.join(root_path, "clips", cols[1].replace(".mp3", ".wav")) - items.append({"text": text, "audio_file": wav_file, "speaker_name": "MCV_" + speaker_name}) + items.append( + {"text": text, "audio_file": wav_file, "speaker_name": "MCV_" + speaker_name, "root_path": root_path} + ) return items @@ -288,7 +295,14 @@ def libri_tts(root_path, meta_files=None, ignored_speakers=None): if isinstance(ignored_speakers, list): if speaker_name in ignored_speakers: continue - items.append({"text": text, "audio_file": wav_file, "speaker_name": f"LTTS_{speaker_name}"}) + items.append( + { + "text": text, + "audio_file": wav_file, + "speaker_name": f"LTTS_{speaker_name}", + "root_path": root_path, + } + ) for item in items: assert os.path.exists(item["audio_file"]), f" [!] wav files don't exist - {item['audio_file']}" return items @@ -307,7 +321,7 @@ def custom_turkish(root_path, meta_file, **kwargs): # pylint: disable=unused-ar skipped_files.append(wav_file) continue text = cols[1].strip() - items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) print(f" [!] {len(skipped_files)} files skipped. 
They don't exist...") return items @@ -329,7 +343,7 @@ def brspeech(root_path, meta_file, ignored_speakers=None): if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_id}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_id, "root_path": root_path}) return items @@ -372,7 +386,9 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic else: wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + f"_{mic}.{file_ext}") if os.path.exists(wav_file): - items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id}) + items.append( + {"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id, "root_path": root_path} + ) else: print(f" [!] wav files don't exist - {wav_file}") return items @@ -392,7 +408,9 @@ def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=Non with open(meta_file, "r", encoding="utf-8") as file_text: text = file_text.readlines()[0] wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + ".wav") - items.append({"text": text, "audio_file": wav_file, "speaker_name": "VCTK_old_" + speaker_id}) + items.append( + {"text": text, "audio_file": wav_file, "speaker_name": "VCTK_old_" + speaker_id, "root_path": root_path} + ) return items @@ -411,7 +429,7 @@ def synpaflex(root_path, metafiles=None, **kwargs): # pylint: disable=unused-ar if os.path.exists(txt_file) and os.path.exists(wav_file): with open(txt_file, "r", encoding="utf-8") as file_text: text = file_text.readlines()[0] - items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items @@ -433,7 +451,7 @@ def open_bible(root_path, meta_files="train", ignore_digits_sentences=True, igno if ignore_digits_sentences and any(map(str.isdigit, text)): continue wav_file = os.path.join(root_path, split_dir, speaker_id, file_id + ".flac") - items.append({"text": text, "audio_file": wav_file, "speaker_name": "OB_" + speaker_id}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": "OB_" + speaker_id, "root_path": root_path}) return items @@ -450,7 +468,9 @@ def mls(root_path, meta_files=None, ignored_speakers=None): if isinstance(ignored_speakers, list): if speaker in ignored_speakers: continue - items.append({"text": text, "audio_file": wav_file, "speaker_name": "MLS_" + speaker}) + items.append( + {"text": text, "audio_file": wav_file, "speaker_name": "MLS_" + speaker, "root_path": root_path} + ) return items @@ -520,7 +540,9 @@ def emotion(root_path, meta_file, ignored_speakers=None): if isinstance(ignored_speakers, list): if speaker_id in ignored_speakers: continue - items.append({"audio_file": wav_file, "speaker_name": speaker_id, "emotion_name": emotion_id}) + items.append( + {"audio_file": wav_file, "speaker_name": speaker_id, "emotion_name": emotion_id, "root_path": root_path} + ) return items @@ -540,7 +562,7 @@ def baker(root_path: str, meta_file: str, **kwargs) -> List[List[str]]: # pylin for line in ttf: wav_name, text = line.rstrip("\n").split("|") wav_path = os.path.join(root_path, "clips_22", wav_name) - items.append({"text": text, "audio_file": wav_path, "speaker_name": speaker_name}) + items.append({"text": text, "audio_file": wav_path, "speaker_name": speaker_name, "root_path": root_path}) return items @@ -554,5 
+576,5 @@ def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument cols = line.split("|") wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") text = cols[2].replace(" ", "") - items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items diff --git a/TTS/tts/layers/generic/wavenet.py b/TTS/tts/layers/generic/wavenet.py index aeb45c7b..613ad19d 100644 --- a/TTS/tts/layers/generic/wavenet.py +++ b/TTS/tts/layers/generic/wavenet.py @@ -67,9 +67,14 @@ class WN(torch.nn.Module): for i in range(num_layers): dilation = dilation_rate**i padding = int((kernel_size * dilation - dilation) / 2) - in_layer = torch.nn.Conv1d( - hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=padding - ) + if i == 0: + in_layer = torch.nn.Conv1d( + in_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=padding + ) + else: + in_layer = torch.nn.Conv1d( + hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=padding + ) in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") self.in_layers.append(in_layer) diff --git a/TTS/tts/layers/glow_tts/decoder.py b/TTS/tts/layers/glow_tts/decoder.py index f57c3731..61c5174a 100644 --- a/TTS/tts/layers/glow_tts/decoder.py +++ b/TTS/tts/layers/glow_tts/decoder.py @@ -29,11 +29,11 @@ def squeeze(x, x_mask=None, num_sqz=2): def unsqueeze(x, x_mask=None, num_sqz=2): - """GlowTTS unsqueeze operation + """GlowTTS unsqueeze operation (revert the squeeze) Note: each 's' is a n-dimensional vector. - ``[[s1, s3, s5], [s2, s4, s6]] --> [[s1, s3, s5], [s2, s4, s6]]`` + ``[[s1, s3, s5], [s2, s4, s6]] --> [[s1, s3, s5, s2, s4, s6]]`` """ b, c, t = x.size() diff --git a/TTS/tts/layers/glow_tts/glow.py b/TTS/tts/layers/glow_tts/glow.py index ff1b99e8..3b745018 100644 --- a/TTS/tts/layers/glow_tts/glow.py +++ b/TTS/tts/layers/glow_tts/glow.py @@ -197,7 +197,7 @@ class CouplingBlock(nn.Module): end.bias.data.zero_() self.end = end # coupling layers - self.wn = WN(in_channels, hidden_channels, kernel_size, dilation_rate, num_layers, c_in_channels, dropout_p) + self.wn = WN(hidden_channels, hidden_channels, kernel_size, dilation_rate, num_layers, c_in_channels, dropout_p) def forward(self, x, x_mask=None, reverse=False, g=None, **kwargs): # pylint: disable=unused-argument """ diff --git a/TTS/tts/layers/losses.py b/TTS/tts/layers/losses.py index 1f0961b3..9933df6b 100644 --- a/TTS/tts/layers/losses.py +++ b/TTS/tts/layers/losses.py @@ -7,8 +7,8 @@ from torch import nn from torch.nn import functional from TTS.tts.utils.helpers import sequence_mask -from TTS.tts.utils.ssim import ssim -from TTS.utils.audio import TorchSTFT +from TTS.tts.utils.ssim import SSIMLoss as _SSIMLoss +from TTS.utils.audio.torch_transforms import TorchSTFT # pylint: disable=abstract-method @@ -91,30 +91,55 @@ class MSELossMasked(nn.Module): return loss +def sample_wise_min_max(x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: + """Min-Max normalize tensor through first dimension + Shapes: + - x: :math:`[B, D1, D2]` + - m: :math:`[B, D1, 1]` + """ + maximum = torch.amax(x.masked_fill(~mask, 0), dim=(1, 2), keepdim=True) + minimum = torch.amin(x.masked_fill(~mask, np.inf), dim=(1, 2), keepdim=True) + return (x - minimum) / (maximum - minimum + 1e-8) + + class SSIMLoss(torch.nn.Module): - """SSIM loss as explained here https://en.wikipedia.org/wiki/Structural_similarity""" + 
"""SSIM loss as (1 - SSIM) + SSIM is explained here https://en.wikipedia.org/wiki/Structural_similarity + """ def __init__(self): super().__init__() - self.loss_func = ssim + self.loss_func = _SSIMLoss() - def forward(self, y_hat, y, length=None): + def forward(self, y_hat, y, length): """ Args: y_hat (tensor): model prediction values. y (tensor): target values. - length (tensor): length of each sample in a batch. + length (tensor): length of each sample in a batch for masking. + Shapes: y_hat: B x T X D y: B x T x D length: B + Returns: loss: An average loss value in range [0, 1] masked by the length. """ - if length is not None: - m = sequence_mask(sequence_length=length, max_len=y.size(1)).unsqueeze(2).float().to(y_hat.device) - y_hat, y = y_hat * m, y * m - return 1 - self.loss_func(y_hat.unsqueeze(1), y.unsqueeze(1)) + mask = sequence_mask(sequence_length=length, max_len=y.size(1)).unsqueeze(2) + y_norm = sample_wise_min_max(y, mask) + y_hat_norm = sample_wise_min_max(y_hat, mask) + ssim_loss = self.loss_func((y_norm * mask).unsqueeze(1), (y_hat_norm * mask).unsqueeze(1)) + + if ssim_loss.item() > 1.0: + print(f" > SSIM loss is out-of-range {ssim_loss.item()}, setting it 1.0") + ssim_loss = torch.tensor(1.0, device=ssim_loss.device) + + if ssim_loss.item() < 0.0: + print(f" > SSIM loss is out-of-range {ssim_loss.item()}, setting it 0.0") + ssim_loss = torch.tensor(0.0, device=ssim_loss.device) + + return ssim_loss class AttentionEntropyLoss(nn.Module): @@ -123,9 +148,6 @@ class AttentionEntropyLoss(nn.Module): """ Forces attention to be more decisive by penalizing soft attention weights - - TODO: arguments - TODO: unit_test """ entropy = torch.distributions.Categorical(probs=align).entropy() loss = (entropy / np.log(align.shape[1])).mean() @@ -133,9 +155,17 @@ class AttentionEntropyLoss(nn.Module): class BCELossMasked(nn.Module): - def __init__(self, pos_weight): + """BCE loss with masking. + + Used mainly for stopnet in autoregressive models. + + Args: + pos_weight (float): weight for positive samples. If set < 1, penalize early stopping. Defaults to None. + """ + + def __init__(self, pos_weight: float = None): super().__init__() - self.pos_weight = pos_weight + self.pos_weight = nn.Parameter(torch.tensor([pos_weight]), requires_grad=False) def forward(self, x, target, length): """ @@ -155,16 +185,17 @@ class BCELossMasked(nn.Module): Returns: loss: An average loss value in range [0, 1] masked by the length. 
""" - # mask: (batch, max_len, 1) target.requires_grad = False if length is not None: - mask = sequence_mask(sequence_length=length, max_len=target.size(1)).float() - x = x * mask - target = target * mask + # mask: (batch, max_len, 1) + mask = sequence_mask(sequence_length=length, max_len=target.size(1)) num_items = mask.sum() + loss = functional.binary_cross_entropy_with_logits( + x.masked_select(mask), target.masked_select(mask), pos_weight=self.pos_weight, reduction="sum" + ) else: + loss = functional.binary_cross_entropy_with_logits(x, target, pos_weight=self.pos_weight, reduction="sum") num_items = torch.numel(x) - loss = functional.binary_cross_entropy_with_logits(x, target, pos_weight=self.pos_weight, reduction="sum") loss = loss / num_items return loss diff --git a/TTS/tts/layers/tacotron/capacitron_layers.py b/TTS/tts/layers/tacotron/capacitron_layers.py index 56fe44bc..68321358 100644 --- a/TTS/tts/layers/tacotron/capacitron_layers.py +++ b/TTS/tts/layers/tacotron/capacitron_layers.py @@ -53,6 +53,7 @@ class CapacitronVAE(nn.Module): text_summary_out = self.text_summary_net(text_inputs, input_lengths).to(reference_mels.device) enc_out = torch.cat([enc_out, text_summary_out], dim=-1) if speaker_embedding is not None: + speaker_embedding = torch.squeeze(speaker_embedding) enc_out = torch.cat([enc_out, speaker_embedding], dim=-1) # Feed the output of the ref encoder and information about text/speaker into diff --git a/TTS/tts/models/base_tts.py b/TTS/tts/models/base_tts.py index c86bd391..df64429d 100644 --- a/TTS/tts/models/base_tts.py +++ b/TTS/tts/models/base_tts.py @@ -137,7 +137,7 @@ class BaseTTS(BaseTrainerModel): if hasattr(self, "speaker_manager"): if config.use_d_vector_file: if speaker_name is None: - d_vector = self.speaker_manager.get_random_embeddings() + d_vector = self.speaker_manager.get_random_embedding() else: d_vector = self.speaker_manager.get_d_vector_by_name(speaker_name) elif config.use_speaker_embedding: diff --git a/TTS/tts/models/glow_tts.py b/TTS/tts/models/glow_tts.py index 7c0f95e1..cc241c43 100644 --- a/TTS/tts/models/glow_tts.py +++ b/TTS/tts/models/glow_tts.py @@ -514,7 +514,7 @@ class GlowTTS(BaseTTS): y = y[:, :, :y_max_length] if attn is not None: attn = attn[:, :, :, :y_max_length] - y_lengths = (y_lengths // self.num_squeeze) * self.num_squeeze + y_lengths = torch.div(y_lengths, self.num_squeeze, rounding_mode="floor") * self.num_squeeze return y, y_lengths, y_max_length, attn def store_inverse(self): diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index a6b1c743..15fa297f 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -4,6 +4,7 @@ from dataclasses import dataclass, field, replace from itertools import chain from typing import Dict, List, Tuple, Union +import numpy as np import torch import torch.distributed as dist import torchaudio @@ -13,6 +14,8 @@ from torch import nn from torch.cuda.amp.autocast_mode import autocast from torch.nn import functional as F from torch.utils.data import DataLoader +from torch.utils.data.sampler import WeightedRandomSampler +from trainer.torch import DistributedSampler, DistributedSamplerWrapper from trainer.trainer_utils import get_optimizer, get_scheduler from TTS.tts.configs.shared_configs import CharactersConfig @@ -29,6 +32,8 @@ from TTS.tts.utils.synthesis import synthesis from TTS.tts.utils.text.characters import BaseCharacters, _characters, _pad, _phonemes, _punctuations from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.tts.utils.visual import plot_alignment 
+from TTS.utils.io import load_fsspec +from TTS.utils.samplers import BucketBatchSampler from TTS.vocoder.models.hifigan_generator import HifiganGenerator from TTS.vocoder.utils.generic_utils import plot_results @@ -200,11 +205,51 @@ def wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fm return spec +############################# +# CONFIGS +############################# + + +@dataclass +class VitsAudioConfig(Coqpit): + fft_size: int = 1024 + sample_rate: int = 22050 + win_length: int = 1024 + hop_length: int = 256 + num_mels: int = 80 + mel_fmin: int = 0 + mel_fmax: int = None + + ############################## # DATASET ############################## +def get_attribute_balancer_weights(items: list, attr_name: str, multi_dict: dict = None): + """Create inverse frequency weights for balancing the dataset. + Use `multi_dict` to scale relative weights.""" + attr_names_samples = np.array([item[attr_name] for item in items]) + unique_attr_names = np.unique(attr_names_samples).tolist() + attr_idx = [unique_attr_names.index(l) for l in attr_names_samples] + attr_count = np.array([len(np.where(attr_names_samples == l)[0]) for l in unique_attr_names]) + weight_attr = 1.0 / attr_count + dataset_samples_weight = np.array([weight_attr[l] for l in attr_idx]) + dataset_samples_weight = dataset_samples_weight / np.linalg.norm(dataset_samples_weight) + if multi_dict is not None: + # check if all keys are in the multi_dict + for k in multi_dict: + assert k in unique_attr_names, f"{k} not in {unique_attr_names}" + # scale weights + multiplier_samples = np.array([multi_dict.get(item[attr_name], 1.0) for item in items]) + dataset_samples_weight *= multiplier_samples + return ( + torch.from_numpy(dataset_samples_weight).float(), + unique_attr_names, + np.unique(dataset_samples_weight).tolist(), + ) + + class VitsDataset(TTSDataset): def __init__(self, model_args, *args, **kwargs): super().__init__(*args, **kwargs) @@ -786,7 +831,7 @@ class Vits(BaseTTS): print(" > Text Encoder was reinit.") def get_aux_input(self, aux_input: Dict): - sid, g, lid = self._set_cond_input(aux_input) + sid, g, lid, _ = self._set_cond_input(aux_input) return {"speaker_ids": sid, "style_wav": None, "d_vectors": g, "language_ids": lid} def _freeze_layers(self): @@ -817,7 +862,7 @@ class Vits(BaseTTS): @staticmethod def _set_cond_input(aux_input: Dict): """Set the speaker conditioning input based on the multi-speaker mode.""" - sid, g, lid = None, None, None + sid, g, lid, durations = None, None, None, None if "speaker_ids" in aux_input and aux_input["speaker_ids"] is not None: sid = aux_input["speaker_ids"] if sid.ndim == 0: @@ -832,7 +877,10 @@ class Vits(BaseTTS): if lid.ndim == 0: lid = lid.unsqueeze_(0) - return sid, g, lid + if "durations" in aux_input and aux_input["durations"] is not None: + durations = aux_input["durations"] + + return sid, g, lid, durations def _set_speaker_input(self, aux_input: Dict): d_vectors = aux_input.get("d_vectors", None) @@ -946,7 +994,7 @@ class Vits(BaseTTS): - syn_spk_emb: :math:`[B, 1, speaker_encoder.proj_dim]` """ outputs = {} - sid, g, lid = self._set_cond_input(aux_input) + sid, g, lid, _ = self._set_cond_input(aux_input) # speaker embedding if self.args.use_speaker_embedding and sid is not None: g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] @@ -1028,7 +1076,9 @@ class Vits(BaseTTS): @torch.no_grad() def inference( - self, x, aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None, "language_ids": None} + self, + x, + aux_input={"x_lengths": None, 
"d_vectors": None, "speaker_ids": None, "language_ids": None, "durations": None}, ): # pylint: disable=dangerous-default-value """ Note: @@ -1048,7 +1098,7 @@ class Vits(BaseTTS): - m_p: :math:`[B, C, T_dec]` - logs_p: :math:`[B, C, T_dec]` """ - sid, g, lid = self._set_cond_input(aux_input) + sid, g, lid, durations = self._set_cond_input(aux_input) x_lengths = self._set_x_lengths(x, aux_input) # speaker embedding @@ -1062,21 +1112,25 @@ class Vits(BaseTTS): x, m_p, logs_p, x_mask = self.text_encoder(x, x_lengths, lang_emb=lang_emb) - if self.args.use_sdp: - logw = self.duration_predictor( - x, - x_mask, - g=g if self.args.condition_dp_on_speaker else None, - reverse=True, - noise_scale=self.inference_noise_scale_dp, - lang_emb=lang_emb, - ) + if durations is None: + if self.args.use_sdp: + logw = self.duration_predictor( + x, + x_mask, + g=g if self.args.condition_dp_on_speaker else None, + reverse=True, + noise_scale=self.inference_noise_scale_dp, + lang_emb=lang_emb, + ) + else: + logw = self.duration_predictor( + x, x_mask, g=g if self.args.condition_dp_on_speaker else None, lang_emb=lang_emb + ) + w = torch.exp(logw) * x_mask * self.length_scale else: - logw = self.duration_predictor( - x, x_mask, g=g if self.args.condition_dp_on_speaker else None, lang_emb=lang_emb - ) + assert durations.shape[-1] == x.shape[-1] + w = durations.unsqueeze(0) - w = torch.exp(logw) * x_mask * self.length_scale w_ceil = torch.ceil(w) y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long() y_mask = sequence_mask(y_lengths, None).to(x_mask.dtype).unsqueeze(1) # [B, 1, T_dec] @@ -1341,7 +1395,7 @@ class Vits(BaseTTS): if hasattr(self, "speaker_manager"): if config.use_d_vector_file: if speaker_name is None: - d_vector = self.speaker_manager.get_random_embeddings() + d_vector = self.speaker_manager.get_random_embedding() else: d_vector = self.speaker_manager.get_mean_embedding(speaker_name, num_samples=None, randomize=False) elif config.use_speaker_embedding: @@ -1485,6 +1539,42 @@ class Vits(BaseTTS): batch["mel"] = batch["mel"] * sequence_mask(batch["mel_lens"]).unsqueeze(1) return batch + def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1, is_eval=False): + weights = None + data_items = dataset.samples + if getattr(config, "use_weighted_sampler", False): + for attr_name, alpha in config.weighted_sampler_attrs.items(): + print(f" > Using weighted sampler for attribute '{attr_name}' with alpha '{alpha}'") + multi_dict = config.weighted_sampler_multipliers.get(attr_name, None) + print(multi_dict) + weights, attr_names, attr_weights = get_attribute_balancer_weights( + attr_name=attr_name, items=data_items, multi_dict=multi_dict + ) + weights = weights * alpha + print(f" > Attribute weights for '{attr_names}' \n | > {attr_weights}") + + # input_audio_lenghts = [os.path.getsize(x["audio_file"]) for x in data_items] + + if weights is not None: + w_sampler = WeightedRandomSampler(weights, len(weights)) + batch_sampler = BucketBatchSampler( + w_sampler, + data=data_items, + batch_size=config.eval_batch_size if is_eval else config.batch_size, + sort_key=lambda x: os.path.getsize(x["audio_file"]), + drop_last=True, + ) + else: + batch_sampler = None + # sampler for DDP + if batch_sampler is None: + batch_sampler = DistributedSampler(dataset) if num_gpus > 1 else None + else: # If a sampler is already defined use this sampler and DDP sampler together + batch_sampler = ( + DistributedSamplerWrapper(batch_sampler) if num_gpus > 1 else batch_sampler + ) # TODO: check batch_sampler with 
multi-gpu + return batch_sampler + def get_data_loader( self, config: Coqpit, @@ -1523,17 +1613,24 @@ class Vits(BaseTTS): # get samplers sampler = self.get_sampler(config, dataset, num_gpus) - - loader = DataLoader( - dataset, - batch_size=config.eval_batch_size if is_eval else config.batch_size, - shuffle=False, # shuffle is done in the dataset. - drop_last=False, # setting this False might cause issues in AMP training. - sampler=sampler, - collate_fn=dataset.collate_fn, - num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, - pin_memory=False, - ) + if sampler is None: + loader = DataLoader( + dataset, + batch_size=config.eval_batch_size if is_eval else config.batch_size, + shuffle=False, # shuffle is done in the dataset. + collate_fn=dataset.collate_fn, + drop_last=False, # setting this False might cause issues in AMP training. + num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, + pin_memory=False, + ) + else: + loader = DataLoader( + dataset, + batch_sampler=sampler, + collate_fn=dataset.collate_fn, + num_workers=config.num_eval_loader_workers if is_eval else config.num_loader_workers, + pin_memory=False, + ) return loader def get_optimizer(self) -> List: @@ -1590,7 +1687,7 @@ class Vits(BaseTTS): strict=True, ): # pylint: disable=unused-argument, redefined-builtin """Load the model checkpoint and setup for training or inference""" - state = torch.load(checkpoint_path, map_location=torch.device("cpu")) + state = load_fsspec(checkpoint_path, map_location=torch.device("cpu")) # compat band-aid for the pre-trained models to not use the encoder baked into the model # TODO: consider baking the speaker encoder into the model and call it from there. # as it is probably easier for model distribution. diff --git a/TTS/tts/utils/helpers.py b/TTS/tts/utils/helpers.py index c2e7f561..b62004c8 100644 --- a/TTS/tts/utils/helpers.py +++ b/TTS/tts/utils/helpers.py @@ -76,7 +76,7 @@ def segment(x: torch.tensor, segment_indices: torch.tensor, segment_size=4, pad_ index_start = segment_indices[i] index_end = index_start + segment_size x_i = x[i] - if pad_short and index_end > x.size(2): + if pad_short and index_end >= x.size(2): # pad the sample if it is shorter than the segment size x_i = torch.nn.functional.pad(x_i, (0, (index_end + 1) - x.size(2))) segments[i] = x_i[:, index_start:index_end] @@ -107,16 +107,16 @@ def rand_segments( T = segment_size if _x_lenghts is None: _x_lenghts = T - len_diff = _x_lenghts - segment_size + 1 + len_diff = _x_lenghts - segment_size if let_short_samples: _x_lenghts[len_diff < 0] = segment_size - len_diff = _x_lenghts - segment_size + 1 + len_diff = _x_lenghts - segment_size else: assert all( len_diff > 0 ), f" [!] At least one sample is shorter than the segment size ({segment_size}). 
\n {_x_lenghts}" - segment_indices = (torch.rand([B]).type_as(x) * len_diff).long() - ret = segment(x, segment_indices, segment_size) + segment_indices = (torch.rand([B]).type_as(x) * (len_diff + 1)).long() + ret = segment(x, segment_indices, segment_size, pad_short=pad_short) return ret, segment_indices diff --git a/TTS/tts/utils/ssim.py b/TTS/tts/utils/ssim.py index ab2c6991..4bc3befc 100644 --- a/TTS/tts/utils/ssim.py +++ b/TTS/tts/utils/ssim.py @@ -1,73 +1,383 @@ -# taken from https://github.com/Po-Hsun-Su/pytorch-ssim +# Adopted from https://github.com/photosynthesis-team/piq -from math import exp +from typing import List, Optional, Tuple, Union import torch import torch.nn.functional as F -from torch.autograd import Variable +from torch.nn.modules.loss import _Loss -def gaussian(window_size, sigma): - gauss = torch.Tensor([exp(-((x - window_size // 2) ** 2) / float(2 * sigma**2)) for x in range(window_size)]) - return gauss / gauss.sum() +def _reduce(x: torch.Tensor, reduction: str = "mean") -> torch.Tensor: + r"""Reduce input in batch dimension if needed. + Args: + x: Tensor with shape (N, *). + reduction: Specifies the reduction type: + ``'none'`` | ``'mean'`` | ``'sum'``. Default: ``'mean'`` + """ + if reduction == "none": + return x + if reduction == "mean": + return x.mean(dim=0) + if reduction == "sum": + return x.sum(dim=0) + raise ValueError("Unknown reduction. Expected one of {'none', 'mean', 'sum'}") -def create_window(window_size, channel): - _1D_window = gaussian(window_size, 1.5).unsqueeze(1) - _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0) - window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous()) - return window +def _validate_input( + tensors: List[torch.Tensor], + dim_range: Tuple[int, int] = (0, -1), + data_range: Tuple[float, float] = (0.0, -1.0), + # size_dim_range: Tuple[float, float] = (0., -1.), + size_range: Optional[Tuple[int, int]] = None, +) -> None: + r"""Check that input(-s) satisfies the requirements + Args: + tensors: Tensors to check + dim_range: Allowed number of dimensions. (min, max) + data_range: Allowed range of values in tensors. (min, max) + size_range: Dimensions to include in size comparison. 
(start_dim, end_dim + 1) + """ + if not __debug__: + return -def _ssim(img1, img2, window, window_size, channel, size_average=True): - mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) - mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) + x = tensors[0] - # TODO: check if you need AMP disabled - # with torch.cuda.amp.autocast(enabled=False): - mu1_sq = mu1.float().pow(2) - mu2_sq = mu2.float().pow(2) - mu1_mu2 = mu1 * mu2 + for t in tensors: + assert torch.is_tensor(t), f"Expected torch.Tensor, got {type(t)}" + assert t.device == x.device, f"Expected tensors to be on {x.device}, got {t.device}" - sigma1_sq = F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq - sigma2_sq = F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq - sigma12 = F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel) - mu1_mu2 - - C1 = 0.01**2 - C2 = 0.03**2 - - ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) - - if size_average: - return ssim_map.mean() - return ssim_map.mean(1).mean(1).mean(1) - - -class SSIM(torch.nn.Module): - def __init__(self, window_size=11, size_average=True): - super().__init__() - self.window_size = window_size - self.size_average = size_average - self.channel = 1 - self.window = create_window(window_size, self.channel) - - def forward(self, img1, img2): - (_, channel, _, _) = img1.size() - - if channel == self.channel and self.window.data.type() == img1.data.type(): - window = self.window + if size_range is None: + assert t.size() == x.size(), f"Expected tensors with same size, got {t.size()} and {x.size()}" else: - window = create_window(self.window_size, channel) - window = window.type_as(img1) + assert ( + t.size()[size_range[0] : size_range[1]] == x.size()[size_range[0] : size_range[1]] + ), f"Expected tensors with same size at given dimensions, got {t.size()} and {x.size()}" - self.window = window - self.channel = channel + if dim_range[0] == dim_range[1]: + assert t.dim() == dim_range[0], f"Expected number of dimensions to be {dim_range[0]}, got {t.dim()}" + elif dim_range[0] < dim_range[1]: + assert ( + dim_range[0] <= t.dim() <= dim_range[1] + ), f"Expected number of dimensions to be between {dim_range[0]} and {dim_range[1]}, got {t.dim()}" - return _ssim(img1, img2, window, self.window_size, channel, self.size_average) + if data_range[0] < data_range[1]: + assert data_range[0] <= t.min(), f"Expected values to be greater or equal to {data_range[0]}, got {t.min()}" + assert t.max() <= data_range[1], f"Expected values to be lower or equal to {data_range[1]}, got {t.max()}" -def ssim(img1, img2, window_size=11, size_average=True): - (_, channel, _, _) = img1.size() - window = create_window(window_size, channel).type_as(img1) - window = window.type_as(img1) - return _ssim(img1, img2, window, window_size, channel, size_average) +def gaussian_filter(kernel_size: int, sigma: float) -> torch.Tensor: + r"""Returns 2D Gaussian kernel N(0,`sigma`^2) + Args: + size: Size of the kernel + sigma: Std of the distribution + Returns: + gaussian_kernel: Tensor with shape (1, kernel_size, kernel_size) + """ + coords = torch.arange(kernel_size, dtype=torch.float32) + coords -= (kernel_size - 1) / 2.0 + + g = coords**2 + g = (-(g.unsqueeze(0) + g.unsqueeze(1)) / (2 * sigma**2)).exp() + + g /= g.sum() + return g.unsqueeze(0) + + +def ssim( + x: torch.Tensor, + y: torch.Tensor, + kernel_size: int = 11, + kernel_sigma: float = 
1.5, + data_range: Union[int, float] = 1.0, + reduction: str = "mean", + full: bool = False, + downsample: bool = True, + k1: float = 0.01, + k2: float = 0.03, +) -> List[torch.Tensor]: + r"""Interface of Structural Similarity (SSIM) index. + Inputs supposed to be in range ``[0, data_range]``. + To match performance with skimage and tensorflow set ``'downsample' = True``. + + Args: + x: An input tensor. Shape :math:`(N, C, H, W)` or :math:`(N, C, H, W, 2)`. + y: A target tensor. Shape :math:`(N, C, H, W)` or :math:`(N, C, H, W, 2)`. + kernel_size: The side-length of the sliding window used in comparison. Must be an odd value. + kernel_sigma: Sigma of normal distribution. + data_range: Maximum value range of images (usually 1.0 or 255). + reduction: Specifies the reduction type: + ``'none'`` | ``'mean'`` | ``'sum'``. Default:``'mean'`` + full: Return cs map or not. + downsample: Perform average pool before SSIM computation. Default: True + k1: Algorithm parameter, K1 (small constant). + k2: Algorithm parameter, K2 (small constant). + Try a larger K2 constant (e.g. 0.4) if you get a negative or NaN results. + + Returns: + Value of Structural Similarity (SSIM) index. In case of 5D input tensors, complex value is returned + as a tensor of size 2. + + References: + Wang, Z., Bovik, A. C., Sheikh, H. R., & Simoncelli, E. P. (2004). + Image quality assessment: From error visibility to structural similarity. + IEEE Transactions on Image Processing, 13, 600-612. + https://ece.uwaterloo.ca/~z70wang/publications/ssim.pdf, + DOI: `10.1109/TIP.2003.819861` + """ + assert kernel_size % 2 == 1, f"Kernel size must be odd, got [{kernel_size}]" + _validate_input([x, y], dim_range=(4, 5), data_range=(0, data_range)) + + x = x / float(data_range) + y = y / float(data_range) + + # Averagepool image if the size is large enough + f = max(1, round(min(x.size()[-2:]) / 256)) + if (f > 1) and downsample: + x = F.avg_pool2d(x, kernel_size=f) + y = F.avg_pool2d(y, kernel_size=f) + + kernel = gaussian_filter(kernel_size, kernel_sigma).repeat(x.size(1), 1, 1, 1).to(y) + _compute_ssim_per_channel = _ssim_per_channel_complex if x.dim() == 5 else _ssim_per_channel + ssim_map, cs_map = _compute_ssim_per_channel(x=x, y=y, kernel=kernel, k1=k1, k2=k2) + ssim_val = ssim_map.mean(1) + cs = cs_map.mean(1) + + ssim_val = _reduce(ssim_val, reduction) + cs = _reduce(cs, reduction) + + if full: + return [ssim_val, cs] + + return ssim_val + + +class SSIMLoss(_Loss): + r"""Creates a criterion that measures the structural similarity index error between + each element in the input :math:`x` and target :math:`y`. + + To match performance with skimage and tensorflow set ``'downsample' = True``. + + The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as: + + .. math:: + SSIM = \{ssim_1,\dots,ssim_{N \times C}\}\\ + ssim_{l}(x, y) = \frac{(2 \mu_x \mu_y + c_1) (2 \sigma_{xy} + c_2)} + {(\mu_x^2 +\mu_y^2 + c_1)(\sigma_x^2 +\sigma_y^2 + c_2)}, + + where :math:`N` is the batch size, `C` is the channel size. If :attr:`reduction` is not ``'none'`` + (default ``'mean'``), then: + + .. math:: + SSIMLoss(x, y) = + \begin{cases} + \operatorname{mean}(1 - SSIM), & \text{if reduction} = \text{'mean';}\\ + \operatorname{sum}(1 - SSIM), & \text{if reduction} = \text{'sum'.} + \end{cases} + + :math:`x` and :math:`y` are tensors of arbitrary shapes with a total + of :math:`n` elements each. + + The sum operation still operates over all the elements, and divides by :math:`n`. 
+ The division by :math:`n` can be avoided if one sets ``reduction = 'sum'``. + In case of 5D input tensors, complex value is returned as a tensor of size 2. + + Args: + kernel_size: By default, the mean and covariance of a pixel is obtained + by convolution with given filter_size. + kernel_sigma: Standard deviation for Gaussian kernel. + k1: Coefficient related to c1 in the above equation. + k2: Coefficient related to c2 in the above equation. + downsample: Perform average pool before SSIM computation. Default: True + reduction: Specifies the reduction type: + ``'none'`` | ``'mean'`` | ``'sum'``. Default:``'mean'`` + data_range: Maximum value range of images (usually 1.0 or 255). + + Examples: + >>> loss = SSIMLoss() + >>> x = torch.rand(3, 3, 256, 256, requires_grad=True) + >>> y = torch.rand(3, 3, 256, 256) + >>> output = loss(x, y) + >>> output.backward() + + References: + Wang, Z., Bovik, A. C., Sheikh, H. R., & Simoncelli, E. P. (2004). + Image quality assessment: From error visibility to structural similarity. + IEEE Transactions on Image Processing, 13, 600-612. + https://ece.uwaterloo.ca/~z70wang/publications/ssim.pdf, + DOI:`10.1109/TIP.2003.819861` + """ + __constants__ = ["kernel_size", "k1", "k2", "sigma", "kernel", "reduction"] + + def __init__( + self, + kernel_size: int = 11, + kernel_sigma: float = 1.5, + k1: float = 0.01, + k2: float = 0.03, + downsample: bool = True, + reduction: str = "mean", + data_range: Union[int, float] = 1.0, + ) -> None: + super().__init__() + + # Generic loss parameters. + self.reduction = reduction + + # Loss-specific parameters. + self.kernel_size = kernel_size + + # This check might look redundant because kernel size is checked within the ssim function anyway. + # However, this check allows to fail fast when the loss is being initialised and training has not been started. + assert kernel_size % 2 == 1, f"Kernel size must be odd, got [{kernel_size}]" + self.kernel_sigma = kernel_sigma + self.k1 = k1 + self.k2 = k2 + self.downsample = downsample + self.data_range = data_range + + def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: + r"""Computation of Structural Similarity (SSIM) index as a loss function. + + Args: + x: An input tensor. Shape :math:`(N, C, H, W)` or :math:`(N, C, H, W, 2)`. + y: A target tensor. Shape :math:`(N, C, H, W)` or :math:`(N, C, H, W, 2)`. + + Returns: + Value of SSIM loss to be minimized, i.e ``1 - ssim`` in [0, 1] range. In case of 5D input tensors, + complex value is returned as a tensor of size 2. + """ + + score = ssim( + x=x, + y=y, + kernel_size=self.kernel_size, + kernel_sigma=self.kernel_sigma, + downsample=self.downsample, + data_range=self.data_range, + reduction=self.reduction, + full=False, + k1=self.k1, + k2=self.k2, + ) + return torch.ones_like(score) - score + + +def _ssim_per_channel( + x: torch.Tensor, + y: torch.Tensor, + kernel: torch.Tensor, + k1: float = 0.01, + k2: float = 0.03, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + r"""Calculate Structural Similarity (SSIM) index for X and Y per channel. + + Args: + x: An input tensor. Shape :math:`(N, C, H, W)`. + y: A target tensor. Shape :math:`(N, C, H, W)`. + kernel: 2D Gaussian kernel. + k1: Algorithm parameter, K1 (small constant, see [1]). + k2: Algorithm parameter, K2 (small constant, see [1]). + Try a larger K2 constant (e.g. 0.4) if you get a negative or NaN results. + + Returns: + Full Value of Structural Similarity (SSIM) index. 
+ """ + if x.size(-1) < kernel.size(-1) or x.size(-2) < kernel.size(-2): + raise ValueError( + f"Kernel size can't be greater than actual input size. Input size: {x.size()}. " + f"Kernel size: {kernel.size()}" + ) + + c1 = k1**2 + c2 = k2**2 + n_channels = x.size(1) + mu_x = F.conv2d(x, weight=kernel, stride=1, padding=0, groups=n_channels) + mu_y = F.conv2d(y, weight=kernel, stride=1, padding=0, groups=n_channels) + + mu_xx = mu_x**2 + mu_yy = mu_y**2 + mu_xy = mu_x * mu_y + + sigma_xx = F.conv2d(x**2, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_xx + sigma_yy = F.conv2d(y**2, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_yy + sigma_xy = F.conv2d(x * y, weight=kernel, stride=1, padding=0, groups=n_channels) - mu_xy + + # Contrast sensitivity (CS) with alpha = beta = gamma = 1. + cs = (2.0 * sigma_xy + c2) / (sigma_xx + sigma_yy + c2) + + # Structural similarity (SSIM) + ss = (2.0 * mu_xy + c1) / (mu_xx + mu_yy + c1) * cs + + ssim_val = ss.mean(dim=(-1, -2)) + cs = cs.mean(dim=(-1, -2)) + return ssim_val, cs + + +def _ssim_per_channel_complex( + x: torch.Tensor, + y: torch.Tensor, + kernel: torch.Tensor, + k1: float = 0.01, + k2: float = 0.03, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + r"""Calculate Structural Similarity (SSIM) index for Complex X and Y per channel. + + Args: + x: An input tensor. Shape :math:`(N, C, H, W, 2)`. + y: A target tensor. Shape :math:`(N, C, H, W, 2)`. + kernel: 2-D gauss kernel. + k1: Algorithm parameter, K1 (small constant, see [1]). + k2: Algorithm parameter, K2 (small constant, see [1]). + Try a larger K2 constant (e.g. 0.4) if you get a negative or NaN results. + + Returns: + Full Value of Complex Structural Similarity (SSIM) index. + """ + n_channels = x.size(1) + if x.size(-2) < kernel.size(-1) or x.size(-3) < kernel.size(-2): + raise ValueError( + f"Kernel size can't be greater than actual input size. Input size: {x.size()}. " + f"Kernel size: {kernel.size()}" + ) + + c1 = k1**2 + c2 = k2**2 + + x_real = x[..., 0] + x_imag = x[..., 1] + y_real = y[..., 0] + y_imag = y[..., 1] + + mu1_real = F.conv2d(x_real, weight=kernel, stride=1, padding=0, groups=n_channels) + mu1_imag = F.conv2d(x_imag, weight=kernel, stride=1, padding=0, groups=n_channels) + mu2_real = F.conv2d(y_real, weight=kernel, stride=1, padding=0, groups=n_channels) + mu2_imag = F.conv2d(y_imag, weight=kernel, stride=1, padding=0, groups=n_channels) + + mu1_sq = mu1_real.pow(2) + mu1_imag.pow(2) + mu2_sq = mu2_real.pow(2) + mu2_imag.pow(2) + mu1_mu2_real = mu1_real * mu2_real - mu1_imag * mu2_imag + mu1_mu2_imag = mu1_real * mu2_imag + mu1_imag * mu2_real + + compensation = 1.0 + + x_sq = x_real.pow(2) + x_imag.pow(2) + y_sq = y_real.pow(2) + y_imag.pow(2) + x_y_real = x_real * y_real - x_imag * y_imag + x_y_imag = x_real * y_imag + x_imag * y_real + + sigma1_sq = F.conv2d(x_sq, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_sq + sigma2_sq = F.conv2d(y_sq, weight=kernel, stride=1, padding=0, groups=n_channels) - mu2_sq + sigma12_real = F.conv2d(x_y_real, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_mu2_real + sigma12_imag = F.conv2d(x_y_imag, weight=kernel, stride=1, padding=0, groups=n_channels) - mu1_mu2_imag + sigma12 = torch.stack((sigma12_imag, sigma12_real), dim=-1) + mu1_mu2 = torch.stack((mu1_mu2_real, mu1_mu2_imag), dim=-1) + # Set alpha = beta = gamma = 1. 
+ cs_map = (sigma12 * 2 + c2 * compensation) / (sigma1_sq.unsqueeze(-1) + sigma2_sq.unsqueeze(-1) + c2 * compensation) + ssim_map = (mu1_mu2 * 2 + c1 * compensation) / (mu1_sq.unsqueeze(-1) + mu2_sq.unsqueeze(-1) + c1 * compensation) + ssim_map = ssim_map * cs_map + + ssim_val = ssim_map.mean(dim=(-2, -3)) + cs = cs_map.mean(dim=(-2, -3)) + + return ssim_val, cs diff --git a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py index 024f79c6..281da221 100644 --- a/TTS/tts/utils/text/phonemizers/espeak_wrapper.py +++ b/TTS/tts/utils/text/phonemizers/espeak_wrapper.py @@ -1,4 +1,5 @@ import logging +import re import subprocess from typing import Dict, List @@ -163,6 +164,13 @@ class ESpeak(BasePhonemizer): # dealing with the conditions descrived above ph_decoded = ph_decoded[:1].replace("_", "") + ph_decoded[1:] + + # espeak-ng backend can add language flags that need to be removed: + # "sɛʁtˈɛ̃ mˈo kɔm (en)fˈʊtbɔːl(fr) ʒenˈɛʁ de- flˈaɡ də- lˈɑ̃ɡ." + # phonemize needs to remove the language flags of the returned text: + # "sɛʁtˈɛ̃ mˈo kɔm fˈʊtbɔːl ʒenˈɛʁ de- flˈaɡ də- lˈɑ̃ɡ." + ph_decoded = re.sub(r"\(.+?\)", "", ph_decoded) + phonemes += ph_decoded.strip() return phonemes.replace("_", separator) diff --git a/TTS/tts/utils/text/punctuation.py b/TTS/tts/utils/text/punctuation.py index b2a058bb..8d199cc5 100644 --- a/TTS/tts/utils/text/punctuation.py +++ b/TTS/tts/utils/text/punctuation.py @@ -137,7 +137,7 @@ class Punctuation: # nothing have been phonemized, returns the puncs alone if not text: - return ["".join(m.mark for m in puncs)] + return ["".join(m.punc for m in puncs)] current = puncs[0] diff --git a/TTS/utils/audio/__init__.py b/TTS/utils/audio/__init__.py new file mode 100644 index 00000000..f18f2219 --- /dev/null +++ b/TTS/utils/audio/__init__.py @@ -0,0 +1 @@ +from TTS.utils.audio.processor import AudioProcessor diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py new file mode 100644 index 00000000..f6f03855 --- /dev/null +++ b/TTS/utils/audio/numpy_transforms.py @@ -0,0 +1,425 @@ +from typing import Tuple + +import librosa +import numpy as np +import pyworld as pw +import scipy +import soundfile as sf + +# For using kwargs +# pylint: disable=unused-argument + + +def build_mel_basis( + *, + sample_rate: int = None, + fft_size: int = None, + num_mels: int = None, + mel_fmax: int = None, + mel_fmin: int = None, + **kwargs, +) -> np.ndarray: + """Build melspectrogram basis. + + Returns: + np.ndarray: melspectrogram basis. + """ + if mel_fmax is not None: + assert mel_fmax <= sample_rate // 2 + assert mel_fmax - mel_fmin > 0 + return librosa.filters.mel(sr=sample_rate, n_fft=fft_size, n_mels=num_mels, fmin=mel_fmin, fmax=mel_fmax) + + +def millisec_to_length( + *, frame_length_ms: int = None, frame_shift_ms: int = None, sample_rate: int = None, **kwargs +) -> Tuple[int, int]: + """Compute hop and window length from milliseconds. + + Returns: + Tuple[int, int]: hop length and window length for STFT. + """ + factor = frame_length_ms / frame_shift_ms + assert (factor).is_integer(), " [!] 
frame_shift_ms should divide frame_length_ms" + win_length = int(frame_length_ms / 1000.0 * sample_rate) + hop_length = int(win_length / float(factor)) + return win_length, hop_length + + +def _log(x, base): + if base == 10: + return np.log10(x) + return np.log(x) + + +def _exp(x, base): + if base == 10: + return np.power(10, x) + return np.exp(x) + + +def amp_to_db(*, x: np.ndarray = None, gain: float = 1, base: int = 10, **kwargs) -> np.ndarray: + """Convert amplitude values to decibels. + + Args: + x (np.ndarray): Amplitude spectrogram. + gain (float): Gain factor. Defaults to 1. + base (int): Logarithm base. Defaults to 10. + + Returns: + np.ndarray: Decibels spectrogram. + """ + assert (x < 0).sum() == 0, " [!] Input values must be non-negative." + return gain * _log(np.maximum(1e-8, x), base) + + +# pylint: disable=no-self-use +def db_to_amp(*, x: np.ndarray = None, gain: float = 1, base: int = 10, **kwargs) -> np.ndarray: + """Convert decibels spectrogram to amplitude spectrogram. + + Args: + x (np.ndarray): Decibels spectrogram. + gain (float): Gain factor. Defaults to 1. + base (int): Logarithm base. Defaults to 10. + + Returns: + np.ndarray: Amplitude spectrogram. + """ + return _exp(x / gain, base) + + +def preemphasis(*, x: np.ndarray, coef: float = 0.97, **kwargs) -> np.ndarray: + """Apply pre-emphasis to the audio signal. Useful to reduce the correlation between neighbouring signal values. + + Args: + x (np.ndarray): Audio signal. + + Raises: + RuntimeError: Preemphasis coeff is set to 0. + + Returns: + np.ndarray: Decorrelated audio signal. + """ + if coef == 0: + raise RuntimeError(" [!] Preemphasis is set 0.0.") + return scipy.signal.lfilter([1, -coef], [1], x) + + +def deemphasis(*, x: np.ndarray = None, coef: float = 0.97, **kwargs) -> np.ndarray: + """Reverse pre-emphasis.""" + if coef == 0: + raise RuntimeError(" [!] Preemphasis is set 0.0.") + return scipy.signal.lfilter([1], [1, -coef], x) + + +def spec_to_mel(*, spec: np.ndarray, mel_basis: np.ndarray = None, **kwargs) -> np.ndarray: + """Convert a full scale linear spectrogram output of a network to a melspectrogram. + + Args: + spec (np.ndarray): Normalized full scale linear spectrogram. + + Shapes: + - spec: :math:`[C, T]` + + Returns: + np.ndarray: Normalized melspectrogram. + """ + return np.dot(mel_basis, spec) + + +def mel_to_spec(*, mel: np.ndarray = None, mel_basis: np.ndarray = None, **kwargs) -> np.ndarray: + """Convert a melspectrogram to full scale spectrogram.""" + assert (mel < 0).sum() == 0, " [!] Input values must be non-negative." + inv_mel_basis = np.linalg.pinv(mel_basis) + return np.maximum(1e-10, np.dot(inv_mel_basis, mel)) + + +def wav_to_spec(*, wav: np.ndarray = None, **kwargs) -> np.ndarray: + """Compute a spectrogram from a waveform. + + Args: + wav (np.ndarray): Waveform. Shape :math:`[T_wav,]` + + Returns: + np.ndarray: Spectrogram. Shape :math:`[C, T_spec]`. 
:math:`T_spec == T_wav / hop_length` + """ + D = stft(y=wav, **kwargs) + S = np.abs(D) + return S.astype(np.float32) + + +def wav_to_mel(*, wav: np.ndarray = None, mel_basis=None, **kwargs) -> np.ndarray: + """Compute a melspectrogram from a waveform.""" + D = stft(y=wav, **kwargs) + S = spec_to_mel(spec=np.abs(D), mel_basis=mel_basis, **kwargs) + return S.astype(np.float32) + + +def spec_to_wav(*, spec: np.ndarray, power: float = 1.5, **kwargs) -> np.ndarray: + """Convert a spectrogram to a waveform using Griffi-Lim vocoder.""" + S = spec.copy() + return griffin_lim(spec=S**power, **kwargs) + + +def mel_to_wav(*, mel: np.ndarray = None, power: float = 1.5, **kwargs) -> np.ndarray: + """Convert a melspectrogram to a waveform using Griffi-Lim vocoder.""" + S = mel.copy() + S = mel_to_spec(mel=S, mel_basis=kwargs["mel_basis"]) # Convert back to linear + return griffin_lim(spec=S**power, **kwargs) + + +### STFT and ISTFT ### +def stft( + *, + y: np.ndarray = None, + fft_size: int = None, + hop_length: int = None, + win_length: int = None, + pad_mode: str = "reflect", + window: str = "hann", + center: bool = True, + **kwargs, +) -> np.ndarray: + """Librosa STFT wrapper. + + Check http://librosa.org/doc/main/generated/librosa.stft.html argument details. + + Returns: + np.ndarray: Complex number array. + """ + return librosa.stft( + y=y, + n_fft=fft_size, + hop_length=hop_length, + win_length=win_length, + pad_mode=pad_mode, + window=window, + center=center, + ) + + +def istft( + *, + y: np.ndarray = None, + fft_size: int = None, + hop_length: int = None, + win_length: int = None, + window: str = "hann", + center: bool = True, + **kwargs, +) -> np.ndarray: + """Librosa iSTFT wrapper. + + Check http://librosa.org/doc/main/generated/librosa.istft.html argument details. + + Returns: + np.ndarray: Complex number array. + """ + return librosa.istft(y, hop_length=hop_length, win_length=win_length, center=center, window=window) + + +def griffin_lim(*, spec: np.ndarray = None, num_iter=60, **kwargs) -> np.ndarray: + angles = np.exp(2j * np.pi * np.random.rand(*spec.shape)) + S_complex = np.abs(spec).astype(np.complex) + y = istft(y=S_complex * angles, **kwargs) + if not np.isfinite(y).all(): + print(" [!] Waveform is not finite everywhere. Skipping the GL.") + return np.array([0.0]) + for _ in range(num_iter): + angles = np.exp(1j * np.angle(stft(y=y, **kwargs))) + y = istft(y=S_complex * angles, **kwargs) + return y + + +def compute_stft_paddings( + *, x: np.ndarray = None, hop_length: int = None, pad_two_sides: bool = False, **kwargs +) -> Tuple[int, int]: + """Compute paddings used by Librosa's STFT. Compute right padding (final frame) or both sides padding + (first and final frames)""" + pad = (x.shape[0] // hop_length + 1) * hop_length - x.shape[0] + if not pad_two_sides: + return 0, pad + return pad // 2, pad // 2 + pad % 2 + + +def compute_f0( + *, x: np.ndarray = None, pitch_fmax: float = None, hop_length: int = None, sample_rate: int = None, **kwargs +) -> np.ndarray: + """Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram. + + Args: + x (np.ndarray): Waveform. Shape :math:`[T_wav,]` + + Returns: + np.ndarray: Pitch. Shape :math:`[T_pitch,]`. 
:math:`T_pitch == T_wav / hop_length` + + Examples: + >>> WAV_FILE = filename = librosa.util.example_audio_file() + >>> from TTS.config import BaseAudioConfig + >>> from TTS.utils.audio.processor import AudioProcessor + >>> conf = BaseAudioConfig(pitch_fmax=8000) + >>> ap = AudioProcessor(**conf) + >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050] + >>> pitch = ap.compute_f0(wav) + """ + assert pitch_fmax is not None, " [!] Set `pitch_fmax` before calling `compute_f0`." + + f0, t = pw.dio( + x.astype(np.double), + fs=sample_rate, + f0_ceil=pitch_fmax, + frame_period=1000 * hop_length / sample_rate, + ) + f0 = pw.stonemask(x.astype(np.double), f0, t, sample_rate) + return f0 + + +### Audio Processing ### +def find_endpoint( + *, + wav: np.ndarray = None, + trim_db: float = -40, + sample_rate: int = None, + min_silence_sec=0.8, + gain: float = None, + base: int = None, + **kwargs, +) -> int: + """Find the last point without silence at the end of an audio signal. + + Args: + wav (np.ndarray): Audio signal. + trim_db (float, optional): Silence threshold in decibels. Defaults to -40. + min_silence_sec (float, optional): Ignore silences that are shorter than this, in seconds. Defaults to 0.8. + gain (float, optional): Gain to be used to convert trim_db to trim_amp. Defaults to None. + base (int, optional): Base of the logarithm used to convert trim_db to trim_amp. Defaults to 10. + + Returns: + int: Last point without silence. + """ + window_length = int(sample_rate * min_silence_sec) + hop_length = int(window_length / 4) + threshold = db_to_amp(x=-trim_db, gain=gain, base=base) + for x in range(hop_length, len(wav) - window_length, hop_length): + if np.max(wav[x : x + window_length]) < threshold: + return x + hop_length + return len(wav) + + +def trim_silence( + *, + wav: np.ndarray = None, + sample_rate: int = None, + trim_db: float = None, + win_length: int = None, + hop_length: int = None, + **kwargs, +) -> np.ndarray: + """Trim silent parts with a threshold and 0.01 sec margin""" + margin = int(sample_rate * 0.01) + wav = wav[margin:-margin] + return librosa.effects.trim(wav, top_db=trim_db, frame_length=win_length, hop_length=hop_length)[0] + + +def volume_norm(*, x: np.ndarray = None, coef: float = 0.95, **kwargs) -> np.ndarray: + """Normalize the volume of an audio signal. + + Args: + x (np.ndarray): Raw waveform. + coef (float): Coefficient to rescale the maximum value. Defaults to 0.95. + + Returns: + np.ndarray: Volume normalized waveform. + """ + return x / abs(x).max() * coef + + +def rms_norm(*, wav: np.ndarray = None, db_level: float = -27.0, **kwargs) -> np.ndarray: + r = 10 ** (db_level / 20) + a = np.sqrt((len(wav) * (r**2)) / np.sum(wav**2)) + return wav * a + + +def rms_volume_norm(*, x: np.ndarray, db_level: float = -27.0, **kwargs) -> np.ndarray: + """Normalize the volume based on RMS of the signal. + + Args: + x (np.ndarray): Raw waveform. + db_level (float): Target dB level in RMS. Defaults to -27.0. + + Returns: + np.ndarray: RMS normalized waveform. + """ + assert -99 <= db_level <= 0, " [!] db_level should be between -99 and 0" + wav = rms_norm(wav=x, db_level=db_level) + return wav + + +def load_wav(*, filename: str, sample_rate: int = None, resample: bool = False, **kwargs) -> np.ndarray: + """Read a wav file using Librosa and optionally resample, silence trim, volume normalize. + + Resampling slows down loading the file significantly. Therefore it is recommended to resample the file beforehand. + + Args: + filename (str): Path to the wav file.
+ sr (int, optional): Sampling rate for resampling. Defaults to None. + resample (bool, optional): Resample the audio file when loading. Slows down the I/O time. Defaults to False. + + Returns: + np.ndarray: Loaded waveform. + """ + if resample: + # loading with resampling. It is significantly slower. + x, _ = librosa.load(filename, sr=sample_rate) + else: + # SF is faster than librosa for loading files + x, _ = sf.read(filename) + return x + + +def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, **kwargs) -> None: + """Save float waveform to a file using Scipy. + + Args: + wav (np.ndarray): Waveform with float values in range [-1, 1] to save. + path (str): Path to a output file. + sr (int, optional): Sampling rate used for saving to the file. Defaults to None. + """ + wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) + scipy.io.wavfile.write(path, sample_rate, wav_norm.astype(np.int16)) + + +def mulaw_encode(*, wav: np.ndarray, mulaw_qc: int, **kwargs) -> np.ndarray: + mu = 2**mulaw_qc - 1 + signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1.0 + mu) + signal = (signal + 1) / 2 * mu + 0.5 + return np.floor( + signal, + ) + + +def mulaw_decode(*, wav, mulaw_qc: int, **kwargs) -> np.ndarray: + """Recovers waveform from quantized values.""" + mu = 2**mulaw_qc - 1 + x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1) + return x + + +def encode_16bits(*, x: np.ndarray, **kwargs) -> np.ndarray: + return np.clip(x * 2**15, -(2**15), 2**15 - 1).astype(np.int16) + + +def quantize(*, x: np.ndarray, quantize_bits: int, **kwargs) -> np.ndarray: + """Quantize a waveform to a given number of bits. + + Args: + x (np.ndarray): Waveform to quantize. Must be normalized into the range `[-1, 1]`. + quantize_bits (int): Number of quantization bits. + + Returns: + np.ndarray: Quantized waveform. + """ + return (x + 1.0) * (2**quantize_bits - 1) / 2 + + +def dequantize(*, x, quantize_bits, **kwargs) -> np.ndarray: + """Dequantize a waveform from the given number of bits.""" + return 2 * x / (2**quantize_bits - 1) - 1 diff --git a/TTS/utils/audio.py b/TTS/utils/audio/processor.py similarity index 84% rename from TTS/utils/audio.py rename to TTS/utils/audio/processor.py index fc9d1942..5a63b444 100644 --- a/TTS/utils/audio.py +++ b/TTS/utils/audio/processor.py @@ -6,179 +6,14 @@ import pyworld as pw import scipy.io.wavfile import scipy.signal import soundfile as sf -import torch -from torch import nn from TTS.tts.utils.helpers import StandardScaler - -class TorchSTFT(nn.Module): # pylint: disable=abstract-method - """Some of the audio processing funtions using Torch for faster batch processing. - - TODO: Merge this with audio.py - - Args: - - n_fft (int): - FFT window size for STFT. - - hop_length (int): - number of frames between STFT columns. - - win_length (int, optional): - STFT window length. - - pad_wav (bool, optional): - If True pad the audio with (n_fft - hop_length) / 2). Defaults to False. - - window (str, optional): - The name of a function to create a window tensor that is applied/multiplied to each frame/window. Defaults to "hann_window" - - sample_rate (int, optional): - target audio sampling rate. Defaults to None. - - mel_fmin (int, optional): - minimum filter frequency for computing melspectrograms. Defaults to None. - - mel_fmax (int, optional): - maximum filter frequency for computing melspectrograms. Defaults to None. - - n_mels (int, optional): - number of melspectrogram dimensions. Defaults to None. 
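Referring back to the quantization helpers at the end of `numpy_transforms.py` above, a small round-trip sketch (illustrative only; values are assumptions):

```
import numpy as np
from TTS.utils.audio import numpy_transforms as np_transforms

x = np.random.uniform(-1.0, 1.0, size=1000).astype(np.float32)

q = np_transforms.quantize(x=x, quantize_bits=10)        # [-1, 1] -> [0, 2**10 - 1]
x_hat = np_transforms.dequantize(x=q, quantize_bits=10)  # back to [-1, 1], up to float32 rounding

codes = np_transforms.mulaw_encode(wav=x, mulaw_qc=10)   # integer mu-law codes in [0, 2**10 - 1]
```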
- - use_mel (bool, optional): - If True compute the melspectrograms otherwise. Defaults to False. - - do_amp_to_db_linear (bool, optional): - enable/disable amplitude to dB conversion of linear spectrograms. Defaults to False. - - spec_gain (float, optional): - gain applied when converting amplitude to DB. Defaults to 1.0. - - power (float, optional): - Exponent for the magnitude spectrogram, e.g., 1 for energy, 2 for power, etc. Defaults to None. - - use_htk (bool, optional): - Use HTK formula in mel filter instead of Slaney. - - mel_norm (None, 'slaney', or number, optional): - If 'slaney', divide the triangular mel weights by the width of the mel band - (area normalization). - - If numeric, use `librosa.util.normalize` to normalize each filter by to unit l_p norm. - See `librosa.util.normalize` for a full description of supported norm values - (including `+-np.inf`). - - Otherwise, leave all the triangles aiming for a peak value of 1.0. Defaults to "slaney". - """ - - def __init__( - self, - n_fft, - hop_length, - win_length, - pad_wav=False, - window="hann_window", - sample_rate=None, - mel_fmin=0, - mel_fmax=None, - n_mels=80, - use_mel=False, - do_amp_to_db=False, - spec_gain=1.0, - power=None, - use_htk=False, - mel_norm="slaney", - ): - super().__init__() - self.n_fft = n_fft - self.hop_length = hop_length - self.win_length = win_length - self.pad_wav = pad_wav - self.sample_rate = sample_rate - self.mel_fmin = mel_fmin - self.mel_fmax = mel_fmax - self.n_mels = n_mels - self.use_mel = use_mel - self.do_amp_to_db = do_amp_to_db - self.spec_gain = spec_gain - self.power = power - self.use_htk = use_htk - self.mel_norm = mel_norm - self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) - self.mel_basis = None - if use_mel: - self._build_mel_basis() - - def __call__(self, x): - """Compute spectrogram frames by torch based stft. - - Args: - x (Tensor): input waveform - - Returns: - Tensor: spectrogram frames. - - Shapes: - x: [B x T] or [:math:`[B, 1, T]`] - """ - if x.ndim == 2: - x = x.unsqueeze(1) - if self.pad_wav: - padding = int((self.n_fft - self.hop_length) / 2) - x = torch.nn.functional.pad(x, (padding, padding), mode="reflect") - # B x D x T x 2 - o = torch.stft( - x.squeeze(1), - self.n_fft, - self.hop_length, - self.win_length, - self.window, - center=True, - pad_mode="reflect", # compatible with audio.py - normalized=False, - onesided=True, - return_complex=False, - ) - M = o[:, :, :, 0] - P = o[:, :, :, 1] - S = torch.sqrt(torch.clamp(M**2 + P**2, min=1e-8)) - - if self.power is not None: - S = S**self.power - - if self.use_mel: - S = torch.matmul(self.mel_basis.to(x), S) - if self.do_amp_to_db: - S = self._amp_to_db(S, spec_gain=self.spec_gain) - return S - - def _build_mel_basis(self): - mel_basis = librosa.filters.mel( - self.sample_rate, - self.n_fft, - n_mels=self.n_mels, - fmin=self.mel_fmin, - fmax=self.mel_fmax, - htk=self.use_htk, - norm=self.mel_norm, - ) - self.mel_basis = torch.from_numpy(mel_basis).float() - - @staticmethod - def _amp_to_db(x, spec_gain=1.0): - return torch.log(torch.clamp(x, min=1e-5) * spec_gain) - - @staticmethod - def _db_to_amp(x, spec_gain=1.0): - return torch.exp(x) / spec_gain - - # pylint: disable=too-many-public-methods -class AudioProcessor(object): - """Audio Processor for TTS used by all the data pipelines. - TODO: Make this a dataclass to replace `BaseAudioConfig`. + +class AudioProcessor(object): + """Audio Processor for TTS. 
Note: All the class arguments are set to default values to enable a flexible initialization diff --git a/TTS/utils/audio/torch_transforms.py b/TTS/utils/audio/torch_transforms.py new file mode 100644 index 00000000..d4523ad0 --- /dev/null +++ b/TTS/utils/audio/torch_transforms.py @@ -0,0 +1,163 @@ +import librosa +import torch +from torch import nn + + +class TorchSTFT(nn.Module): # pylint: disable=abstract-method + """Some of the audio processing funtions using Torch for faster batch processing. + + Args: + + n_fft (int): + FFT window size for STFT. + + hop_length (int): + number of frames between STFT columns. + + win_length (int, optional): + STFT window length. + + pad_wav (bool, optional): + If True pad the audio with (n_fft - hop_length) / 2). Defaults to False. + + window (str, optional): + The name of a function to create a window tensor that is applied/multiplied to each frame/window. Defaults to "hann_window" + + sample_rate (int, optional): + target audio sampling rate. Defaults to None. + + mel_fmin (int, optional): + minimum filter frequency for computing melspectrograms. Defaults to None. + + mel_fmax (int, optional): + maximum filter frequency for computing melspectrograms. Defaults to None. + + n_mels (int, optional): + number of melspectrogram dimensions. Defaults to None. + + use_mel (bool, optional): + If True compute the melspectrograms otherwise. Defaults to False. + + do_amp_to_db_linear (bool, optional): + enable/disable amplitude to dB conversion of linear spectrograms. Defaults to False. + + spec_gain (float, optional): + gain applied when converting amplitude to DB. Defaults to 1.0. + + power (float, optional): + Exponent for the magnitude spectrogram, e.g., 1 for energy, 2 for power, etc. Defaults to None. + + use_htk (bool, optional): + Use HTK formula in mel filter instead of Slaney. + + mel_norm (None, 'slaney', or number, optional): + If 'slaney', divide the triangular mel weights by the width of the mel band + (area normalization). + + If numeric, use `librosa.util.normalize` to normalize each filter by to unit l_p norm. + See `librosa.util.normalize` for a full description of supported norm values + (including `+-np.inf`). + + Otherwise, leave all the triangles aiming for a peak value of 1.0. Defaults to "slaney". + """ + + def __init__( + self, + n_fft, + hop_length, + win_length, + pad_wav=False, + window="hann_window", + sample_rate=None, + mel_fmin=0, + mel_fmax=None, + n_mels=80, + use_mel=False, + do_amp_to_db=False, + spec_gain=1.0, + power=None, + use_htk=False, + mel_norm="slaney", + ): + super().__init__() + self.n_fft = n_fft + self.hop_length = hop_length + self.win_length = win_length + self.pad_wav = pad_wav + self.sample_rate = sample_rate + self.mel_fmin = mel_fmin + self.mel_fmax = mel_fmax + self.n_mels = n_mels + self.use_mel = use_mel + self.do_amp_to_db = do_amp_to_db + self.spec_gain = spec_gain + self.power = power + self.use_htk = use_htk + self.mel_norm = mel_norm + self.window = nn.Parameter(getattr(torch, window)(win_length), requires_grad=False) + self.mel_basis = None + if use_mel: + self._build_mel_basis() + + def __call__(self, x): + """Compute spectrogram frames by torch based stft. + + Args: + x (Tensor): input waveform + + Returns: + Tensor: spectrogram frames. 
+ + Shapes: + x: [B x T] or [:math:`[B, 1, T]`] + """ + if x.ndim == 2: + x = x.unsqueeze(1) + if self.pad_wav: + padding = int((self.n_fft - self.hop_length) / 2) + x = torch.nn.functional.pad(x, (padding, padding), mode="reflect") + # B x D x T x 2 + o = torch.stft( + x.squeeze(1), + self.n_fft, + self.hop_length, + self.win_length, + self.window, + center=True, + pad_mode="reflect", # compatible with audio.py + normalized=False, + onesided=True, + return_complex=False, + ) + M = o[:, :, :, 0] + P = o[:, :, :, 1] + S = torch.sqrt(torch.clamp(M**2 + P**2, min=1e-8)) + + if self.power is not None: + S = S**self.power + + if self.use_mel: + S = torch.matmul(self.mel_basis.to(x), S) + if self.do_amp_to_db: + S = self._amp_to_db(S, spec_gain=self.spec_gain) + return S + + def _build_mel_basis(self): + mel_basis = librosa.filters.mel( + self.sample_rate, + self.n_fft, + n_mels=self.n_mels, + fmin=self.mel_fmin, + fmax=self.mel_fmax, + htk=self.use_htk, + norm=self.mel_norm, + ) + self.mel_basis = torch.from_numpy(mel_basis).float() + + @staticmethod + def _amp_to_db(x, spec_gain=1.0): + return torch.log(torch.clamp(x, min=1e-5) * spec_gain) + + @staticmethod + def _db_to_amp(x, spec_gain=1.0): + return torch.exp(x) / spec_gain diff --git a/TTS/utils/capacitron_optimizer.py b/TTS/utils/capacitron_optimizer.py index c9f075af..fac7d8a0 100644 --- a/TTS/utils/capacitron_optimizer.py +++ b/TTS/utils/capacitron_optimizer.py @@ -34,6 +34,8 @@ class CapacitronOptimizer: self.primary_optimizer.zero_grad() def step(self): + # Update param groups to display the correct learning rate + self.param_groups = self.primary_optimizer.param_groups self.primary_optimizer.step() def zero_grad(self): diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 281e5af0..5eed6683 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -1,4 +1,3 @@ -import io import json import os import zipfile @@ -7,6 +6,7 @@ from shutil import copyfile, rmtree from typing import Dict, Tuple import requests +from tqdm import tqdm from TTS.config import load_config from TTS.utils.generic_utils import get_user_data_dir @@ -337,11 +337,20 @@ class ModelManager(object): def _download_zip_file(file_url, output_folder): """Download the github releases""" # download the file - r = requests.get(file_url) + r = requests.get(file_url, stream=True) # extract the file try: - with zipfile.ZipFile(io.BytesIO(r.content)) as z: + total_size_in_bytes = int(r.headers.get("content-length", 0)) + block_size = 1024 # 1 Kibibyte + progress_bar = tqdm(total=total_size_in_bytes, unit="iB", unit_scale=True) + temp_zip_name = os.path.join(output_folder, file_url.split("/")[-1]) + with open(temp_zip_name, "wb") as file: + for data in r.iter_content(block_size): + progress_bar.update(len(data)) + file.write(data) + with zipfile.ZipFile(temp_zip_name) as z: z.extractall(output_folder) + os.remove(temp_zip_name) # delete zip after extract except zipfile.BadZipFile: print(f" > Error: Bad zip file - {file_url}") raise zipfile.BadZipFile # pylint: disable=raise-missing-from diff --git a/TTS/encoder/utils/samplers.py b/TTS/utils/samplers.py similarity index 55% rename from TTS/encoder/utils/samplers.py rename to TTS/utils/samplers.py index 08256b34..df5d4185 100644 --- a/TTS/encoder/utils/samplers.py +++ b/TTS/utils/samplers.py @@ -1,6 +1,8 @@ +import math import random +from typing import Callable, List, Union -from torch.utils.data.sampler import Sampler, SubsetRandomSampler +from torch.utils.data.sampler import BatchSampler, Sampler, SubsetRandomSampler 
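A minimal sketch of the `TorchSTFT` module moved into `TTS/utils/audio/torch_transforms.py` above, computing log-mel frames for a batch of waveforms (parameter values are illustrative, not taken from any config in this patch):

```
import torch
from TTS.utils.audio.torch_transforms import TorchSTFT

torch_stft = TorchSTFT(
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    sample_rate=22050,
    n_mels=80,
    use_mel=True,
    do_amp_to_db=True,
)
wav = torch.randn(8, 22050)  # [B, T] batch of waveforms
mel = torch_stft(wav)        # [B, n_mels, T // hop_length + 1] log-mel frames
```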
class SubsetSampler(Sampler): @@ -112,3 +114,89 @@ class PerfectBatchSampler(Sampler): def __len__(self): class_batch_size = self._batch_size // self._num_classes_in_batch return min(((len(s) + class_batch_size - 1) // class_batch_size) for s in self._samplers) + + +def identity(x): + return x + + +class SortedSampler(Sampler): + """Samples elements sequentially, always in the same order. + + Taken from https://github.com/PetrochukM/PyTorch-NLP + + Args: + data (iterable): Iterable data. + sort_key (callable): Specifies a function of one argument that is used to extract a + numerical comparison key from each list element. + + Example: + >>> list(SortedSampler(range(10), sort_key=lambda i: -i)) + [9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + + """ + + def __init__(self, data, sort_key: Callable = identity): + super().__init__(data) + self.data = data + self.sort_key = sort_key + zip_ = [(i, self.sort_key(row)) for i, row in enumerate(self.data)] + zip_ = sorted(zip_, key=lambda r: r[1]) + self.sorted_indexes = [item[0] for item in zip_] + + def __iter__(self): + return iter(self.sorted_indexes) + + def __len__(self): + return len(self.data) + + +class BucketBatchSampler(BatchSampler): + """Bucket batch sampler + + Adapted from https://github.com/PetrochukM/PyTorch-NLP + + Args: + sampler (torch.data.utils.sampler.Sampler): + batch_size (int): Size of mini-batch. + drop_last (bool): If `True` the sampler will drop the last batch if its size would be less + than `batch_size`. + data (list): List of data samples. + sort_key (callable, optional): Callable to specify a comparison key for sorting. + bucket_size_multiplier (int, optional): Buckets are of size + `batch_size * bucket_size_multiplier`. + + Example: + >>> sampler = WeightedRandomSampler(weights, len(weights)) + >>> sampler = BucketBatchSampler(sampler, data=data_items, batch_size=32, drop_last=True) + """ + + def __init__( + self, + sampler, + data, + batch_size, + drop_last, + sort_key: Union[Callable, List] = identity, + bucket_size_multiplier=100, + ): + super().__init__(sampler, batch_size, drop_last) + self.data = data + self.sort_key = sort_key + _bucket_size = batch_size * bucket_size_multiplier + if hasattr(sampler, "__len__"): + _bucket_size = min(_bucket_size, len(sampler)) + self.bucket_sampler = BatchSampler(sampler, _bucket_size, False) + + def __iter__(self): + for idxs in self.bucket_sampler: + bucket_data = [self.data[idx] for idx in idxs] + sorted_sampler = SortedSampler(bucket_data, self.sort_key) + for batch_idx in SubsetRandomSampler(list(BatchSampler(sorted_sampler, self.batch_size, self.drop_last))): + sorted_idxs = [idxs[i] for i in batch_idx] + yield sorted_idxs + + def __len__(self): + if self.drop_last: + return len(self.sampler) // self.batch_size + return math.ceil(len(self.sampler) / self.batch_size) diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 2f319809..170bb223 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -307,7 +307,7 @@ class Synthesizer(object): waveform = waveform.squeeze() # trim silence - if self.tts_config.audio["do_trim_silence"] is True: + if "do_trim_silence" in self.tts_config.audio and self.tts_config.audio["do_trim_silence"]: waveform = trim_silence(waveform, self.tts_model.ap) wavs += list(waveform) diff --git a/TTS/vocoder/datasets/wavegrad_dataset.py b/TTS/vocoder/datasets/wavegrad_dataset.py index 05e0fae8..d941eab3 100644 --- a/TTS/vocoder/datasets/wavegrad_dataset.py +++ b/TTS/vocoder/datasets/wavegrad_dataset.py @@ -149,4 +149,4 @@ class 
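A minimal sketch of the `BucketBatchSampler` added to `TTS/utils/samplers.py` above, plugged into a PyTorch `DataLoader` (the toy dataset, index sampler, and collate function are assumptions for illustration):

```
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SequentialSampler

from TTS.utils.samplers import BucketBatchSampler

samples = [{"text": "a" * n} for n in range(1, 100)]  # stand-in dataset items
idx_sampler = SequentialSampler(range(len(samples)))
batch_sampler = BucketBatchSampler(
    idx_sampler,
    data=samples,
    batch_size=8,
    drop_last=True,
    sort_key=lambda item: len(item["text"]),
    bucket_size_multiplier=4,
)
loader = DataLoader(samples, batch_sampler=batch_sampler, collate_fn=lambda batch: batch)
for batch in loader:
    pass  # each batch contains items of similar text length, batches shuffled within a bucket
```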
WaveGradDataset(Dataset): mels[idx, :, : mel.shape[1]] = mel audios[idx, : audio.shape[0]] = audio - return audios, mels + return mels, audios diff --git a/TTS/vocoder/layers/losses.py b/TTS/vocoder/layers/losses.py index 848e292b..befc43cc 100644 --- a/TTS/vocoder/layers/losses.py +++ b/TTS/vocoder/layers/losses.py @@ -4,7 +4,7 @@ import torch from torch import nn from torch.nn import functional as F -from TTS.utils.audio import TorchSTFT +from TTS.utils.audio.torch_transforms import TorchSTFT from TTS.vocoder.utils.distribution import discretized_mix_logistic_loss, gaussian_loss ################################# diff --git a/TTS/vocoder/models/gan.py b/TTS/vocoder/models/gan.py index ed5b26dd..a3803f77 100644 --- a/TTS/vocoder/models/gan.py +++ b/TTS/vocoder/models/gan.py @@ -185,8 +185,7 @@ class GAN(BaseVocoder): outputs = {"model_outputs": self.y_hat_g} return outputs, loss_dict - @staticmethod - def _log(name: str, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, Dict]: + def _log(self, name: str, ap: AudioProcessor, batch: Dict, outputs: Dict) -> Tuple[Dict, Dict]: """Logging shared by the training and evaluation. Args: @@ -198,7 +197,7 @@ class GAN(BaseVocoder): Returns: Tuple[Dict, Dict]: log figures and audio samples. """ - y_hat = outputs[0]["model_outputs"] + y_hat = outputs[0]["model_outputs"] if self.train_disc else outputs[1]["model_outputs"] y = batch["waveform"] figures = plot_results(y_hat, y, ap, name) sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy() diff --git a/TTS/vocoder/models/univnet_discriminator.py b/TTS/vocoder/models/univnet_discriminator.py index d6b0e5d5..34e2d1c2 100644 --- a/TTS/vocoder/models/univnet_discriminator.py +++ b/TTS/vocoder/models/univnet_discriminator.py @@ -3,7 +3,7 @@ import torch.nn.functional as F from torch import nn from torch.nn.utils import spectral_norm, weight_norm -from TTS.utils.audio import TorchSTFT +from TTS.utils.audio.torch_transforms import TorchSTFT from TTS.vocoder.models.hifigan_discriminator import MultiPeriodDiscriminator LRELU_SLOPE = 0.1 diff --git a/TTS/vocoder/models/wavernn.py b/TTS/vocoder/models/wavernn.py index 6686db45..e0a25e32 100644 --- a/TTS/vocoder/models/wavernn.py +++ b/TTS/vocoder/models/wavernn.py @@ -233,6 +233,7 @@ class Wavernn(BaseVocoder): else: raise RuntimeError("Unknown model mode value - ", self.args.mode) + self.ap = AudioProcessor(**config.audio.to_dict()) self.aux_dims = self.args.res_out_dims // 4 if self.args.use_upsample_net: @@ -571,7 +572,7 @@ class Wavernn(BaseVocoder): def test( self, assets: Dict, test_loader: "DataLoader", output: Dict # pylint: disable=unused-argument ) -> Tuple[Dict, Dict]: - ap = assets["audio_processor"] + ap = self.ap figures = {} audios = {} samples = test_loader.dataset.load_test_samples(1) @@ -587,8 +588,16 @@ class Wavernn(BaseVocoder): } ) audios.update({f"test_{idx}/audio": y_hat}) + # audios.update({f"real_{idx}/audio": y_hat}) return figures, audios + def test_log( + self, outputs: Dict, logger: "Logger", assets: Dict, steps: int # pylint: disable=unused-argument + ) -> Tuple[Dict, np.ndarray]: + figures, audios = outputs + logger.eval_figures(steps, figures) + logger.eval_audios(steps, audios, self.ap.sample_rate) + @staticmethod def format_batch(batch: Dict) -> Dict: waveform = batch[0] @@ -605,7 +614,7 @@ class Wavernn(BaseVocoder): verbose: bool, num_gpus: int, ): - ap = assets["audio_processor"] + ap = self.ap dataset = WaveRNNDataset( ap=ap, items=samples, diff --git a/notebooks/dataset_analysis/AnalyzeDataset.ipynb 
b/notebooks/dataset_analysis/AnalyzeDataset.ipynb index 51963847..7fc51a3a 100644 --- a/notebooks/dataset_analysis/AnalyzeDataset.ipynb +++ b/notebooks/dataset_analysis/AnalyzeDataset.ipynb @@ -45,7 +45,7 @@ "source": [ "NUM_PROC = 8\n", "DATASET_CONFIG = BaseDatasetConfig(\n", - " name=\"ljspeech\", meta_file_train=\"metadata.csv\", path=\"/home/ubuntu/TTS/depot/data/male_dataset1_44k/\"\n", + " name=\"ljspeech\", meta_file_train=\"metadata.csv\", path=\"/absolute/path/to/your/dataset/\"\n", ")" ] }, @@ -58,13 +58,13 @@ "def formatter(root_path, meta_file, **kwargs): # pylint: disable=unused-argument\n", " txt_file = os.path.join(root_path, meta_file)\n", " items = []\n", - " speaker_name = \"maledataset1\"\n", + " speaker_name = \"myspeaker\"\n", " with open(txt_file, \"r\", encoding=\"utf-8\") as ttf:\n", " for line in ttf:\n", " cols = line.split(\"|\")\n", - " wav_file = os.path.join(root_path, \"wavs\", cols[0])\n", + " wav_file = os.path.join(root_path, \"wavs\", cols[0] + \".wav\") \n", " text = cols[1]\n", - " items.append([text, wav_file, speaker_name])\n", + " items.append({\"text\": text, \"audio_file\": wav_file, \"speaker_name\": speaker_name})\n", " return items" ] }, @@ -78,7 +78,10 @@ "source": [ "# use your own preprocessor at this stage - TTS/datasets/proprocess.py\n", "train_samples, eval_samples = load_tts_samples(DATASET_CONFIG, eval_split=True, formatter=formatter)\n", - "items = train_samples + eval_samples\n", + "if eval_samples is not None:\n", + " items = train_samples + eval_samples\n", + "else:\n", + " items = train_samples\n", "print(\" > Number of audio files: {}\".format(len(items)))\n", "print(items[1])" ] @@ -94,7 +97,7 @@ "# check wavs if exist\n", "wav_files = []\n", "for item in items:\n", - " wav_file = item[1].strip()\n", + " wav_file = item[\"audio_file\"].strip()\n", " wav_files.append(wav_file)\n", " if not os.path.exists(wav_file):\n", " print(waf_path)" @@ -131,8 +134,8 @@ "outputs": [], "source": [ "def load_item(item):\n", - " text = item[0].strip()\n", - " file_name = item[1].strip()\n", + " text = item[\"text\"].strip()\n", + " file_name = item[\"audio_file\"].strip()\n", " audio, sr = librosa.load(file_name, sr=None)\n", " audio_len = len(audio) / sr\n", " text_len = len(text)\n", @@ -416,7 +419,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.5" + "version": "3.9.12" } }, "nbformat": 4, diff --git a/notebooks/dataset_analysis/PhonemeCoverage.ipynb b/notebooks/dataset_analysis/PhonemeCoverage.ipynb index 2b7f5d67..d481ed29 100644 --- a/notebooks/dataset_analysis/PhonemeCoverage.ipynb +++ b/notebooks/dataset_analysis/PhonemeCoverage.ipynb @@ -37,7 +37,7 @@ "# set some vars\n", "# TTS_PATH = \"/home/thorsten/___dev/tts/mozilla/TTS\"\n", "CONFIG_FILE = \"/path/to/config/config.json\"\n", - "CHARS_TO_REMOVE = \".,:!?'\"" + "CHARS_TO_REMOVE = \".,:!?'\"\n" ] }, { @@ -59,7 +59,8 @@ "# extra imports that might not be included in requirements.txt\n", "import collections\n", "import operator\n", - "\n" + "\n", + "%matplotlib inline" ] }, { @@ -75,7 +76,7 @@ "CONFIG = load_config(CONFIG_FILE)\n", "\n", "# Load some properties from config.json\n", - "CONFIG_METADATA = sorted(load_tts_samples(CONFIG.datasets)[0])\n", + "CONFIG_METADATA = load_tts_samples(CONFIG.datasets)[0]\n", "CONFIG_METADATA = CONFIG_METADATA\n", "CONFIG_DATASET = CONFIG.datasets[0]\n", "CONFIG_PHONEME_LANGUAGE = CONFIG.phoneme_language\n", @@ -84,7 +85,10 @@ "\n", "# Will be printed on generated output graph\n", "CONFIG_RUN_NAME 
= CONFIG.run_name\n", - "CONFIG_RUN_DESC = CONFIG.run_description" + "CONFIG_RUN_DESC = CONFIG.run_description\n", + "\n", + "# Needed to convert text to phonemes and phonemes to ids\n", + "tokenizer, config = TTSTokenizer.init_from_config(CONFIG)" ] }, { @@ -112,12 +116,13 @@ "source": [ "def get_phoneme_from_sequence(text):\n", " temp_list = []\n", - " if len(text[0]) > 0:\n", - " temp_text = text[0].rstrip('\\n')\n", + " if len(text[\"text\"]) > 0:\n", + " #temp_text = text[0].rstrip('\\n')\n", + " temp_text = text[\"text\"].rstrip('\\n')\n", " for rm_bad_chars in CHARS_TO_REMOVE:\n", " temp_text = temp_text.replace(rm_bad_chars,\"\")\n", - " seq = phoneme_to_sequence(temp_text, [CONFIG_TEXT_CLEANER], CONFIG_PHONEME_LANGUAGE, CONFIG_ENABLE_EOS_BOS_CHARS)\n", - " text = sequence_to_phoneme(seq)\n", + " seq = tokenizer.text_to_ids(temp_text)\n", + " text = tokenizer.ids_to_text(seq)\n", " text = text.replace(\" \",\"\")\n", " temp_list.append(text)\n", " return temp_list" @@ -229,7 +234,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -243,7 +248,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.9.12" } }, "nbformat": 4, diff --git a/recipes/blizzard2013/tacotron1-Capacitron/train_capacitron_t1.py b/recipes/blizzard2013/tacotron1-Capacitron/train_capacitron_t1.py index 52c6098f..060cb9d4 100644 --- a/recipes/blizzard2013/tacotron1-Capacitron/train_capacitron_t1.py +++ b/recipes/blizzard2013/tacotron1-Capacitron/train_capacitron_t1.py @@ -48,7 +48,6 @@ config = TacotronConfig( precompute_num_workers=24, run_eval=True, test_delay_epochs=5, - ga_alpha=0.0, r=2, optimizer="CapacitronOptimizer", optimizer_params={"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6}, "SGD": {"lr": 1e-5, "momentum": 0.9}}, @@ -68,16 +67,15 @@ config = TacotronConfig( datasets=[dataset_config], lr=1e-3, lr_scheduler="StepwiseGradualLR", - lr_scheduler_params={"gradual_learning_rates": [[0, 1e-3], [2e4, 5e-4], [4e5, 3e-4], [6e4, 1e-4], [8e4, 5e-5]]}, + lr_scheduler_params={"gradual_learning_rates": [[0, 1e-3], [2e4, 5e-4], [4e4, 3e-4], [6e4, 1e-4], [8e4, 5e-5]]}, scheduler_after_epoch=False, # scheduler doesn't work without this flag - # Need to experiment with these below for capacitron loss_masking=False, decoder_loss_alpha=1.0, postnet_loss_alpha=1.0, - postnet_diff_spec_alpha=0.0, - decoder_diff_spec_alpha=0.0, - decoder_ssim_alpha=0.0, - postnet_ssim_alpha=0.0, + postnet_diff_spec_alpha=1.0, + decoder_diff_spec_alpha=1.0, + decoder_ssim_alpha=1.0, + postnet_ssim_alpha=1.0, ) ap = AudioProcessor(**config.audio.to_dict()) diff --git a/recipes/blizzard2013/tacotron2-Capacitron/train_capacitron_t2.py b/recipes/blizzard2013/tacotron2-Capacitron/train_capacitron_t2.py index cf27b9df..1bd2a036 100644 --- a/recipes/blizzard2013/tacotron2-Capacitron/train_capacitron_t2.py +++ b/recipes/blizzard2013/tacotron2-Capacitron/train_capacitron_t2.py @@ -52,7 +52,6 @@ config = Tacotron2Config( precompute_num_workers=24, run_eval=True, test_delay_epochs=5, - ga_alpha=0.0, r=2, optimizer="CapacitronOptimizer", optimizer_params={"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6}, "SGD": {"lr": 1e-5, "momentum": 0.9}}, @@ -77,23 +76,20 @@ config = Tacotron2Config( "gradual_learning_rates": [ [0, 1e-3], [2e4, 5e-4], - [4e5, 3e-4], + [4e4, 3e-4], [6e4, 1e-4], [8e4, 5e-5], ] }, scheduler_after_epoch=False, # scheduler doesn't work without this flag - # 
dashboard_logger='wandb', - # sort_by_audio_len=True, seq_len_norm=True, - # Need to experiment with these below for capacitron loss_masking=False, decoder_loss_alpha=1.0, postnet_loss_alpha=1.0, - postnet_diff_spec_alpha=0.0, - decoder_diff_spec_alpha=0.0, - decoder_ssim_alpha=0.0, - postnet_ssim_alpha=0.0, + postnet_diff_spec_alpha=1.0, + decoder_diff_spec_alpha=1.0, + decoder_ssim_alpha=1.0, + postnet_ssim_alpha=1.0, ) ap = AudioProcessor(**config.audio.to_dict()) diff --git a/recipes/ljspeech/fast_pitch/train_fast_pitch.py b/recipes/ljspeech/fast_pitch/train_fast_pitch.py index a84658f3..1c0e4702 100644 --- a/recipes/ljspeech/fast_pitch/train_fast_pitch.py +++ b/recipes/ljspeech/fast_pitch/train_fast_pitch.py @@ -54,7 +54,6 @@ config = FastPitchConfig( print_step=50, print_eval=False, mixed_precision=False, - sort_by_audio_len=True, max_seq_len=500000, output_path=output_path, datasets=[dataset_config], diff --git a/recipes/ljspeech/fast_speech/train_fast_speech.py b/recipes/ljspeech/fast_speech/train_fast_speech.py index 0245dd93..ab7e8841 100644 --- a/recipes/ljspeech/fast_speech/train_fast_speech.py +++ b/recipes/ljspeech/fast_speech/train_fast_speech.py @@ -53,7 +53,6 @@ config = FastSpeechConfig( print_step=50, print_eval=False, mixed_precision=False, - sort_by_audio_len=True, max_seq_len=500000, output_path=output_path, datasets=[dataset_config], diff --git a/recipes/ljspeech/speedy_speech/train_speedy_speech.py b/recipes/ljspeech/speedy_speech/train_speedy_speech.py index 1ab3db1c..fd3c8679 100644 --- a/recipes/ljspeech/speedy_speech/train_speedy_speech.py +++ b/recipes/ljspeech/speedy_speech/train_speedy_speech.py @@ -46,7 +46,6 @@ config = SpeedySpeechConfig( print_step=50, print_eval=False, mixed_precision=False, - sort_by_audio_len=True, max_seq_len=500000, output_path=output_path, datasets=[dataset_config], diff --git a/recipes/ljspeech/tacotron2-Capacitron/train_capacitron_t2.py b/recipes/ljspeech/tacotron2-Capacitron/train_capacitron_t2.py index 6bb0aed7..a1882451 100644 --- a/recipes/ljspeech/tacotron2-Capacitron/train_capacitron_t2.py +++ b/recipes/ljspeech/tacotron2-Capacitron/train_capacitron_t2.py @@ -68,7 +68,6 @@ config = Tacotron2Config( print_step=25, print_eval=True, mixed_precision=False, - sort_by_audio_len=True, seq_len_norm=True, output_path=output_path, datasets=[dataset_config], diff --git a/recipes/ljspeech/vits_tts/train_vits.py b/recipes/ljspeech/vits_tts/train_vits.py index c070b3f1..94e230a1 100644 --- a/recipes/ljspeech/vits_tts/train_vits.py +++ b/recipes/ljspeech/vits_tts/train_vits.py @@ -2,11 +2,10 @@ import os from trainer import Trainer, TrainerArgs -from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.models.vits import Vits +from TTS.tts.models.vits import Vits, VitsAudioConfig from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor @@ -14,21 +13,8 @@ output_path = os.path.dirname(os.path.abspath(__file__)) dataset_config = BaseDatasetConfig( name="ljspeech", meta_file_train="metadata.csv", path=os.path.join(output_path, "../LJSpeech-1.1/") ) -audio_config = BaseAudioConfig( - sample_rate=22050, - win_length=1024, - hop_length=256, - num_mels=80, - preemphasis=0.0, - ref_level_db=20, - log_func="np.log", - do_trim_silence=True, - trim_db=45, - mel_fmin=0, - mel_fmax=None, - spec_gain=1.0, - signal_norm=False, - 
do_amp_to_db_linear=False, +audio_config = VitsAudioConfig( + sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None ) config = VitsConfig( @@ -37,7 +23,7 @@ config = VitsConfig( batch_size=32, eval_batch_size=16, batch_group_size=5, - num_loader_workers=0, + num_loader_workers=8, num_eval_loader_workers=4, run_eval=True, test_delay_epochs=-1, @@ -52,6 +38,7 @@ config = VitsConfig( mixed_precision=True, output_path=output_path, datasets=[dataset_config], + cudnn_benchmark=False, ) # INITIALIZE THE AUDIO PROCESSOR diff --git a/recipes/multilingual/vits_tts/train_vits_tts.py b/recipes/multilingual/vits_tts/train_vits_tts.py index 0e650ade..0a9cced4 100644 --- a/recipes/multilingual/vits_tts/train_vits_tts.py +++ b/recipes/multilingual/vits_tts/train_vits_tts.py @@ -3,11 +3,10 @@ from glob import glob from trainer import Trainer, TrainerArgs -from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs +from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig from TTS.tts.utils.languages import LanguageManager from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer @@ -22,22 +21,13 @@ dataset_config = [ for path in dataset_paths ] -audio_config = BaseAudioConfig( +audio_config = VitsAudioConfig( sample_rate=16000, win_length=1024, hop_length=256, num_mels=80, - preemphasis=0.0, - ref_level_db=20, - log_func="np.log", - do_trim_silence=False, - trim_db=23.0, mel_fmin=0, mel_fmax=None, - spec_gain=1.0, - signal_norm=True, - do_amp_to_db_linear=False, - resample=False, ) vitsArgs = VitsArgs( @@ -69,7 +59,6 @@ config = VitsConfig( use_language_weighted_sampler=True, print_eval=False, mixed_precision=False, - sort_by_audio_len=True, min_audio_len=32 * 256 * 4, max_audio_len=160000, output_path=output_path, diff --git a/recipes/thorsten_DE/speedy_speech/train_speedy_speech.py b/recipes/thorsten_DE/speedy_speech/train_speedy_speech.py index 1a4c8ec8..8f241306 100644 --- a/recipes/thorsten_DE/speedy_speech/train_speedy_speech.py +++ b/recipes/thorsten_DE/speedy_speech/train_speedy_speech.py @@ -60,7 +60,6 @@ config = SpeedySpeechConfig( "Dieser Kuchen ist großartig. Er ist so lecker und feucht.", "Vor dem 22. 
November 1963.", ], - sort_by_audio_len=True, max_seq_len=500000, output_path=output_path, datasets=[dataset_config], diff --git a/recipes/thorsten_DE/vits_tts/train_vits.py b/recipes/thorsten_DE/vits_tts/train_vits.py index 86a7dfe6..25c57b64 100644 --- a/recipes/thorsten_DE/vits_tts/train_vits.py +++ b/recipes/thorsten_DE/vits_tts/train_vits.py @@ -2,11 +2,10 @@ import os from trainer import Trainer, TrainerArgs -from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.models.vits import Vits +from TTS.tts.models.vits import Vits, VitsAudioConfig from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor from TTS.utils.downloaders import download_thorsten_de @@ -21,21 +20,13 @@ if not os.path.exists(dataset_config.path): print("Downloading dataset") download_thorsten_de(os.path.split(os.path.abspath(dataset_config.path))[0]) -audio_config = BaseAudioConfig( +audio_config = VitsAudioConfig( sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, - preemphasis=0.0, - ref_level_db=20, - log_func="np.log", - do_trim_silence=True, - trim_db=45, mel_fmin=0, mel_fmax=None, - spec_gain=1.0, - signal_norm=False, - do_amp_to_db_linear=False, ) config = VitsConfig( diff --git a/recipes/vctk/download_vctk.sh b/recipes/vctk/download_vctk.sh index c0cea743..d08a53c6 100644 --- a/recipes/vctk/download_vctk.sh +++ b/recipes/vctk/download_vctk.sh @@ -2,7 +2,7 @@ # take the scripts's parent's directory to prefix all the output paths. RUN_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" echo $RUN_DIR -# download LJSpeech dataset +# download VCTK dataset wget https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip -O VCTK-Corpus-0.92.zip # extract mkdir VCTK diff --git a/recipes/vctk/vits/train_vits.py b/recipes/vctk/vits/train_vits.py index 88fd7de9..814d0989 100644 --- a/recipes/vctk/vits/train_vits.py +++ b/recipes/vctk/vits/train_vits.py @@ -2,11 +2,10 @@ import os from trainer import Trainer, TrainerArgs -from TTS.config.shared_configs import BaseAudioConfig from TTS.tts.configs.shared_configs import BaseDatasetConfig from TTS.tts.configs.vits_config import VitsConfig from TTS.tts.datasets import load_tts_samples -from TTS.tts.models.vits import Vits, VitsArgs +from TTS.tts.models.vits import Vits, VitsArgs, VitsAudioConfig from TTS.tts.utils.speakers import SpeakerManager from TTS.tts.utils.text.tokenizer import TTSTokenizer from TTS.utils.audio import AudioProcessor @@ -17,22 +16,8 @@ dataset_config = BaseDatasetConfig( ) -audio_config = BaseAudioConfig( - sample_rate=22050, - win_length=1024, - hop_length=256, - num_mels=80, - preemphasis=0.0, - ref_level_db=20, - log_func="np.log", - do_trim_silence=True, - trim_db=23.0, - mel_fmin=0, - mel_fmax=None, - spec_gain=1.0, - signal_norm=False, - do_amp_to_db_linear=False, - resample=True, +audio_config = VitsAudioConfig( + sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None ) vitsArgs = VitsArgs( @@ -62,6 +47,7 @@ config = VitsConfig( max_text_len=325, # change this if you have a larger VRAM than 16GB output_path=output_path, datasets=[dataset_config], + cudnn_benchmark=False, ) # INITIALIZE THE AUDIO PROCESSOR diff --git a/requirements.txt b/requirements.txt index b3acfeca..a9416112 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,15 @@ # core 
deps -numpy==1.21.6 +numpy==1.21.6;python_version<"3.10" +numpy==1.22.4;python_version=="3.10" cython==0.29.28 scipy>=1.4.0 torch>=1.7 torchaudio soundfile librosa==0.8.0 -numba==0.55.1 -inflect +numba==0.55.1;python_version<"3.10" +numba==0.55.2;python_version=="3.10" +inflect==5.6.0 tqdm anyascii pyyaml diff --git a/run_bash_tests.sh b/run_bash_tests.sh index feb9082b..2f5ba889 100755 --- a/run_bash_tests.sh +++ b/run_bash_tests.sh @@ -4,5 +4,4 @@ TF_CPP_MIN_LOG_LEVEL=3 # runtime bash based tests # TODO: move these to python ./tests/bash_tests/test_demo_server.sh && \ -./tests/bash_tests/test_resample.sh && \ ./tests/bash_tests/test_compute_statistics.sh diff --git a/setup.py b/setup.py index 3c860949..f95d79f1 100644 --- a/setup.py +++ b/setup.py @@ -90,7 +90,7 @@ setup( # ext_modules=find_cython_extensions(), # package include_package_data=True, - packages=find_packages(include=["TTS*"]), + packages=find_packages(include=["TTS"], exclude=["*.tests", "*tests.*", "tests.*", "*tests", "tests"]), package_data={ "TTS": [ "VERSION", diff --git a/tests/aux_tests/test_audio_processor.py b/tests/aux_tests/test_audio_processor.py index 56611692..d01aeffa 100644 --- a/tests/aux_tests/test_audio_processor.py +++ b/tests/aux_tests/test_audio_processor.py @@ -3,7 +3,7 @@ import unittest from tests import get_tests_input_path, get_tests_output_path, get_tests_path from TTS.config import BaseAudioConfig -from TTS.utils.audio import AudioProcessor +from TTS.utils.audio.processor import AudioProcessor TESTS_PATH = get_tests_path() OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") diff --git a/tests/aux_tests/test_numpy_transforms.py b/tests/aux_tests/test_numpy_transforms.py new file mode 100644 index 00000000..0c1836b9 --- /dev/null +++ b/tests/aux_tests/test_numpy_transforms.py @@ -0,0 +1,105 @@ +import math +import os +import unittest +from dataclasses import dataclass + +import librosa +import numpy as np +from coqpit import Coqpit + +from tests import get_tests_input_path, get_tests_output_path, get_tests_path +from TTS.utils.audio import numpy_transforms as np_transforms + +TESTS_PATH = get_tests_path() +OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests") +WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav") + +os.makedirs(OUT_PATH, exist_ok=True) + + +# pylint: disable=no-self-use + + +class TestNumpyTransforms(unittest.TestCase): + def setUp(self) -> None: + @dataclass + class AudioConfig(Coqpit): + sample_rate: int = 22050 + fft_size: int = 1024 + num_mels: int = 256 + mel_fmax: int = 1800 + mel_fmin: int = 0 + hop_length: int = 256 + win_length: int = 1024 + pitch_fmax: int = 450 + trim_db: int = -1 + min_silence_sec: float = 0.01 + gain: float = 1.0 + base: float = 10.0 + + self.config = AudioConfig() + self.sample_wav, _ = librosa.load(WAV_FILE, sr=self.config.sample_rate) + + def test_build_mel_basis(self): + """Check if the mel basis is correctly built""" + print(" > Testing mel basis building.") + mel_basis = np_transforms.build_mel_basis(**self.config) + self.assertEqual(mel_basis.shape, (self.config.num_mels, self.config.fft_size // 2 + 1)) + + def test_millisec_to_length(self): + """Check if the conversion from milliseconds to length is correct""" + print(" > Testing millisec to length conversion.") + win_len, hop_len = np_transforms.millisec_to_length( + frame_length_ms=1000, frame_shift_ms=12.5, sample_rate=self.config.sample_rate + ) + self.assertEqual(hop_len, int(12.5 / 1000.0 * self.config.sample_rate)) + self.assertEqual(win_len, 
self.config.sample_rate) + + def test_amplitude_db_conversion(self): + di = np.random.rand(11) + o1 = np_transforms.amp_to_db(x=di, gain=1.0, base=10) + o2 = np_transforms.db_to_amp(x=o1, gain=1.0, base=10) + np.testing.assert_almost_equal(di, o2, decimal=5) + + def test_preemphasis_deemphasis(self): + di = np.random.rand(11) + o1 = np_transforms.preemphasis(x=di, coeff=0.95) + o2 = np_transforms.deemphasis(x=o1, coeff=0.95) + np.testing.assert_almost_equal(di, o2, decimal=5) + + def test_spec_to_mel(self): + mel_basis = np_transforms.build_mel_basis(**self.config) + spec = np.random.rand(self.config.fft_size // 2 + 1, 20) # [C, T] + mel = np_transforms.spec_to_mel(spec=spec, mel_basis=mel_basis) + self.assertEqual(mel.shape, (self.config.num_mels, 20)) + + def mel_to_spec(self): + mel_basis = np_transforms.build_mel_basis(**self.config) + mel = np.random.rand(self.config.num_mels, 20) # [C, T] + spec = np_transforms.mel_to_spec(mel=mel, mel_basis=mel_basis) + self.assertEqual(spec.shape, (self.config.fft_size // 2 + 1, 20)) + + def test_wav_to_spec(self): + spec = np_transforms.wav_to_spec(wav=self.sample_wav, **self.config) + self.assertEqual( + spec.shape, (self.config.fft_size // 2 + 1, math.ceil(self.sample_wav.shape[0] / self.config.hop_length)) + ) + + def test_wav_to_mel(self): + mel_basis = np_transforms.build_mel_basis(**self.config) + mel = np_transforms.wav_to_mel(wav=self.sample_wav, mel_basis=mel_basis, **self.config) + self.assertEqual( + mel.shape, (self.config.num_mels, math.ceil(self.sample_wav.shape[0] / self.config.hop_length)) + ) + + def test_compute_f0(self): + pitch = np_transforms.compute_f0(x=self.sample_wav, **self.config) + mel_basis = np_transforms.build_mel_basis(**self.config) + mel = np_transforms.wav_to_mel(wav=self.sample_wav, mel_basis=mel_basis, **self.config) + assert pitch.shape[0] == mel.shape[1] + + def test_load_wav(self): + wav = np_transforms.load_wav(filename=WAV_FILE, resample=False, sample_rate=22050) + wav_resample = np_transforms.load_wav(filename=WAV_FILE, resample=True, sample_rate=16000) + self.assertEqual(wav.shape, (self.sample_wav.shape[0],)) + self.assertNotEqual(wav_resample.shape, (self.sample_wav.shape[0],)) diff --git a/tests/aux_tests/test_remove_silence_vad_script.py b/tests/aux_tests/test_remove_silence_vad_script.py deleted file mode 100644 index c934e065..00000000 --- a/tests/aux_tests/test_remove_silence_vad_script.py +++ /dev/null @@ -1,29 +0,0 @@ -import os -import unittest - -import torch - -from tests import get_tests_input_path, get_tests_output_path, run_cli - -torch.manual_seed(1) - -# pylint: disable=protected-access -class TestRemoveSilenceVAD(unittest.TestCase): - @staticmethod - def test(): - # set paths - wav_path = os.path.join(get_tests_input_path(), "../data/ljspeech/wavs") - output_path = os.path.join(get_tests_output_path(), "output_wavs_removed_silence/") - output_resample_path = os.path.join(get_tests_output_path(), "output_ljspeech_16khz/") - - # resample audios - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/resample.py --input_dir "{wav_path}" --output_dir "{output_resample_path}" --output_sr 16000' - ) - - # run test - run_cli( - f'CUDA_VISIBLE_DEVICES="" python TTS/bin/remove_silence_using_vad.py --input_dir "{output_resample_path}" --output_dir "{output_path}"' - ) - run_cli(f'rm -rf "{output_resample_path}"') - run_cli(f'rm -rf "{output_path}"') diff --git a/tests/bash_tests/test_resample.sh b/tests/bash_tests/test_resample.sh deleted file mode 100755 index ba871272..00000000 --- 
a/tests/bash_tests/test_resample.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash -set -xe -BASEDIR=$(dirname "$0") -TARGET_SR=16000 -echo "$BASEDIR" -#run the resample script -python TTS/bin/resample.py --input_dir $BASEDIR/../data/ljspeech --output_dir $BASEDIR/outputs/resample_tests --output_sr $TARGET_SR -#check samplerate of output -OUT_SR=$( (echo "import librosa" ; echo "y, sr = librosa.load('"$BASEDIR"/outputs/resample_tests/wavs/LJ001-0012.wav', sr=None)" ; echo "print(sr)") | python ) -OUT_SR=$(($OUT_SR + 0)) -if [[ $OUT_SR -ne $TARGET_SR ]]; then - echo "Missmatch between target and output sample rates" - exit 1 -fi -#cleaning up -rm -rf $BASEDIR/outputs/resample_tests diff --git a/tests/data_tests/test_samplers.py b/tests/data_tests/test_samplers.py index b85e0ec4..730d0d8b 100644 --- a/tests/data_tests/test_samplers.py +++ b/tests/data_tests/test_samplers.py @@ -5,11 +5,11 @@ import unittest import torch from TTS.config.shared_configs import BaseDatasetConfig -from TTS.encoder.utils.samplers import PerfectBatchSampler from TTS.tts.datasets import load_tts_samples from TTS.tts.utils.data import get_length_balancer_weights from TTS.tts.utils.languages import get_language_balancer_weights from TTS.tts.utils.speakers import get_speaker_balancer_weights +from TTS.utils.samplers import BucketBatchSampler, PerfectBatchSampler # Fixing random state to avoid random fails torch.manual_seed(0) @@ -163,3 +163,31 @@ class TestSamplers(unittest.TestCase): else: len2 += 1 assert is_balanced(len1, len2), "Length Weighted sampler is supposed to be balanced" + + def test_bucket_batch_sampler(self): + bucket_size_multiplier = 2 + sampler = range(len(train_samples)) + sampler = BucketBatchSampler( + sampler, + data=train_samples, + batch_size=7, + drop_last=True, + sort_key=lambda x: len(x["text"]), + bucket_size_multiplier=bucket_size_multiplier, + ) + + # check if the samples are sorted by text lenght whuile bucketing + min_text_len_in_bucket = 0 + bucket_items = [] + for batch_idx, batch in enumerate(list(sampler)): + if (batch_idx + 1) % bucket_size_multiplier == 0: + for bucket_item in bucket_items: + self.assertLessEqual(min_text_len_in_bucket, len(train_samples[bucket_item]["text"])) + min_text_len_in_bucket = len(train_samples[bucket_item]["text"]) + min_text_len_in_bucket = 0 + bucket_items = [] + else: + bucket_items += batch + + # check sampler length + self.assertEqual(len(sampler), len(train_samples) // 7) diff --git a/tests/text_tests/test_tokenizer.py b/tests/text_tests/test_tokenizer.py index 908952ea..6e95c0ad 100644 --- a/tests/text_tests/test_tokenizer.py +++ b/tests/text_tests/test_tokenizer.py @@ -30,6 +30,13 @@ class TestTTSTokenizer(unittest.TestCase): test_hat = self.tokenizer_ph.ids_to_text(ids) self.assertEqual(text_ph, test_hat) + def test_text_to_ids_phonemes_punctuation(self): + text = "..." + text_ph = self.ph.phonemize(text, separator="") + ids = self.tokenizer_ph.text_to_ids(text) + test_hat = self.tokenizer_ph.ids_to_text(ids) + self.assertEqual(text_ph, test_hat) + def test_text_to_ids_phonemes_with_eos_bos(self): text = "Bu bir Örnek." 
self.tokenizer_ph.use_eos_bos = True diff --git a/tests/tts_tests/test_losses.py b/tests/tts_tests/test_losses.py new file mode 100644 index 00000000..522b7bb1 --- /dev/null +++ b/tests/tts_tests/test_losses.py @@ -0,0 +1,239 @@ +import unittest + +import torch as T + +from TTS.tts.layers.losses import BCELossMasked, L1LossMasked, MSELossMasked, SSIMLoss +from TTS.tts.utils.helpers import sequence_mask + + +class L1LossMaskedTests(unittest.TestCase): + def test_in_out(self): # pylint: disable=no-self-use + # test input == target + layer = L1LossMasked(seq_len_norm=False) + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.ones(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 0.0 + + # test input != target + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + + # test if padded values of input makes any difference + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.arange(5, 9)).long() + mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + + dummy_input = T.rand(4, 8, 128).float() + dummy_target = dummy_input.detach() + dummy_length = (T.arange(5, 9)).long() + mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert output.item() == 0, "0 vs {}".format(output.item()) + + # seq_len_norm = True + # test input == target + layer = L1LossMasked(seq_len_norm=True) + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.ones(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 0.0 + + # test input != target + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + + # test if padded values of input makes any difference + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.arange(5, 9)).long() + mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + + dummy_input = T.rand(4, 8, 128).float() + dummy_target = dummy_input.detach() + dummy_length = (T.arange(5, 9)).long() + mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert output.item() == 0, "0 vs {}".format(output.item()) + + +class MSELossMaskedTests(unittest.TestCase): + def test_in_out(self): # pylint: disable=no-self-use + # test input == target + layer = MSELossMasked(seq_len_norm=False) + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.ones(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 0.0 + + # test input != target + dummy_input = T.ones(4, 8, 128).float() + 
dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + + # test if padded values of input makes any difference + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.arange(5, 9)).long() + mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + + dummy_input = T.rand(4, 8, 128).float() + dummy_target = dummy_input.detach() + dummy_length = (T.arange(5, 9)).long() + mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert output.item() == 0, "0 vs {}".format(output.item()) + + # seq_len_norm = True + # test input == target + layer = MSELossMasked(seq_len_norm=True) + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.ones(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 0.0 + + # test input != target + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + + # test if padded values of input makes any difference + dummy_input = T.ones(4, 8, 128).float() + dummy_target = T.zeros(4, 8, 128).float() + dummy_length = (T.arange(5, 9)).long() + mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + + dummy_input = T.rand(4, 8, 128).float() + dummy_target = dummy_input.detach() + dummy_length = (T.arange(5, 9)).long() + mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert output.item() == 0, "0 vs {}".format(output.item()) + + +class SSIMLossTests(unittest.TestCase): + def test_in_out(self): # pylint: disable=no-self-use + # test input == target + layer = SSIMLoss() + dummy_input = T.ones(4, 57, 128).float() + dummy_target = T.ones(4, 57, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 0.0 + + # test input != target + dummy_input = T.arange(0, 4 * 57 * 128) + dummy_input = dummy_input.reshape(4, 57, 128).float() + dummy_target = T.arange(-4 * 57 * 128, 0) + dummy_target = dummy_target.reshape(4, 57, 128).float() + dummy_target = -dummy_target + + dummy_length = (T.ones(4) * 58).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() >= 1.0, "0 vs {}".format(output.item()) + + # test if padded values of input makes any difference + dummy_input = T.ones(4, 57, 128).float() + dummy_target = T.zeros(4, 57, 128).float() + dummy_length = (T.arange(54, 58)).long() + mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert output.item() == 0.0 + + dummy_input = T.rand(4, 57, 128).float() + dummy_target = dummy_input.detach() + dummy_length = (T.arange(54, 58)).long() + mask = ((sequence_mask(dummy_length).float() - 1.0) 
* 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert output.item() == 0, "0 vs {}".format(output.item()) + + # seq_len_norm = True + # test input == target + layer = L1LossMasked(seq_len_norm=True) + dummy_input = T.ones(4, 57, 128).float() + dummy_target = T.ones(4, 57, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 0.0 + + # test input != target + dummy_input = T.ones(4, 57, 128).float() + dummy_target = T.zeros(4, 57, 128).float() + dummy_length = (T.ones(4) * 8).long() + output = layer(dummy_input, dummy_target, dummy_length) + assert output.item() == 1.0, "1.0 vs {}".format(output.item()) + + # test if padded values of input makes any difference + dummy_input = T.ones(4, 57, 128).float() + dummy_target = T.zeros(4, 57, 128).float() + dummy_length = (T.arange(54, 58)).long() + mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) + + dummy_input = T.rand(4, 57, 128).float() + dummy_target = dummy_input.detach() + dummy_length = (T.arange(54, 58)).long() + mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) + output = layer(dummy_input + mask, dummy_target, dummy_length) + assert output.item() == 0, "0 vs {}".format(output.item()) + + +class BCELossTest(unittest.TestCase): + def test_in_out(self): # pylint: disable=no-self-use + layer = BCELossMasked(pos_weight=5.0) + + length = T.tensor([95]) + target = ( + 1.0 - sequence_mask(length - 1, 100).float() + ) # [0, 0, .... 1, 1] where the first 1 is the last mel frame + true_x = target * 200 - 100 # creates logits of [-100, -100, ... 
100, 100] corresponding to target + zero_x = T.zeros(target.shape) - 100.0 # simulate logits if it never stops decoding + early_x = -200.0 * sequence_mask(length - 3, 100).float() + 100.0 # simulate logits on early stopping + late_x = -200.0 * sequence_mask(length + 1, 100).float() + 100.0 # simulate logits on late stopping + + loss = layer(true_x, target, length) + self.assertEqual(loss.item(), 0.0) + + loss = layer(early_x, target, length) + self.assertAlmostEqual(loss.item(), 2.1053, places=4) + + loss = layer(late_x, target, length) + self.assertAlmostEqual(loss.item(), 5.2632, places=4) + + loss = layer(zero_x, target, length) + self.assertAlmostEqual(loss.item(), 5.2632, places=4) + + # pos_weight should be < 1 to penalize early stopping + layer = BCELossMasked(pos_weight=0.2) + loss = layer(true_x, target, length) + self.assertEqual(loss.item(), 0.0) + + # when pos_weight < 1 overweight the early stopping loss + + loss_early = layer(early_x, target, length) + loss_late = layer(late_x, target, length) + self.assertGreater(loss_early.item(), loss_late.item()) diff --git a/tests/tts_tests/test_tacotron_layers.py b/tests/tts_tests/test_tacotron_layers.py index fdce75dd..43e72417 100644 --- a/tests/tts_tests/test_tacotron_layers.py +++ b/tests/tts_tests/test_tacotron_layers.py @@ -2,9 +2,7 @@ import unittest import torch as T -from TTS.tts.layers.losses import L1LossMasked, SSIMLoss from TTS.tts.layers.tacotron.tacotron import CBHG, Decoder, Encoder, Prenet -from TTS.tts.utils.helpers import sequence_mask # pylint: disable=unused-variable @@ -85,131 +83,3 @@ class EncoderTests(unittest.TestCase): assert output.shape[0] == 4 assert output.shape[1] == 8 assert output.shape[2] == 256 # 128 * 2 BiRNN - - -class L1LossMaskedTests(unittest.TestCase): - def test_in_out(self): # pylint: disable=no-self-use - # test input == target - layer = L1LossMasked(seq_len_norm=False) - dummy_input = T.ones(4, 8, 128).float() - dummy_target = T.ones(4, 8, 128).float() - dummy_length = (T.ones(4) * 8).long() - output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 0.0 - - # test input != target - dummy_input = T.ones(4, 8, 128).float() - dummy_target = T.zeros(4, 8, 128).float() - dummy_length = (T.ones(4) * 8).long() - output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) - - # test if padded values of input makes any difference - dummy_input = T.ones(4, 8, 128).float() - dummy_target = T.zeros(4, 8, 128).float() - dummy_length = (T.arange(5, 9)).long() - mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) - output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) - - dummy_input = T.rand(4, 8, 128).float() - dummy_target = dummy_input.detach() - dummy_length = (T.arange(5, 9)).long() - mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) - output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) - - # seq_len_norm = True - # test input == target - layer = L1LossMasked(seq_len_norm=True) - dummy_input = T.ones(4, 8, 128).float() - dummy_target = T.ones(4, 8, 128).float() - dummy_length = (T.ones(4) * 8).long() - output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 0.0 - - # test input != target - dummy_input = T.ones(4, 8, 128).float() - dummy_target = T.zeros(4, 8, 128).float() - dummy_length = (T.ones(4) * 
8).long() - output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) - - # test if padded values of input makes any difference - dummy_input = T.ones(4, 8, 128).float() - dummy_target = T.zeros(4, 8, 128).float() - dummy_length = (T.arange(5, 9)).long() - mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) - output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) - - dummy_input = T.rand(4, 8, 128).float() - dummy_target = dummy_input.detach() - dummy_length = (T.arange(5, 9)).long() - mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) - output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) - - -class SSIMLossTests(unittest.TestCase): - def test_in_out(self): # pylint: disable=no-self-use - # test input == target - layer = SSIMLoss() - dummy_input = T.ones(4, 8, 128).float() - dummy_target = T.ones(4, 8, 128).float() - dummy_length = (T.ones(4) * 8).long() - output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 0.0 - - # test input != target - dummy_input = T.ones(4, 8, 128).float() - dummy_target = T.zeros(4, 8, 128).float() - dummy_length = (T.ones(4) * 8).long() - output = layer(dummy_input, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-4, "1.0 vs {}".format(output.item()) - - # test if padded values of input makes any difference - dummy_input = T.ones(4, 8, 128).float() - dummy_target = T.zeros(4, 8, 128).float() - dummy_length = (T.arange(5, 9)).long() - mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) - output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-4, "1.0 vs {}".format(output.item()) - - dummy_input = T.rand(4, 8, 128).float() - dummy_target = dummy_input.detach() - dummy_length = (T.arange(5, 9)).long() - mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) - output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) - - # seq_len_norm = True - # test input == target - layer = L1LossMasked(seq_len_norm=True) - dummy_input = T.ones(4, 8, 128).float() - dummy_target = T.ones(4, 8, 128).float() - dummy_length = (T.ones(4) * 8).long() - output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 0.0 - - # test input != target - dummy_input = T.ones(4, 8, 128).float() - dummy_target = T.zeros(4, 8, 128).float() - dummy_length = (T.ones(4) * 8).long() - output = layer(dummy_input, dummy_target, dummy_length) - assert output.item() == 1.0, "1.0 vs {}".format(output.item()) - - # test if padded values of input makes any difference - dummy_input = T.ones(4, 8, 128).float() - dummy_target = T.zeros(4, 8, 128).float() - dummy_length = (T.arange(5, 9)).long() - mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) - output = layer(dummy_input + mask, dummy_target, dummy_length) - assert abs(output.item() - 1.0) < 1e-5, "1.0 vs {}".format(output.item()) - - dummy_input = T.rand(4, 8, 128).float() - dummy_target = dummy_input.detach() - dummy_length = (T.arange(5, 9)).long() - mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) - output = layer(dummy_input + mask, dummy_target, dummy_length) - assert output.item() == 0, "0 vs {}".format(output.item()) diff --git 
a/tests/tts_tests/test_vits.py b/tests/tts_tests/test_vits.py index b9cebb5a..7d474c20 100644 --- a/tests/tts_tests/test_vits.py +++ b/tests/tts_tests/test_vits.py @@ -9,7 +9,17 @@ from tests import assertHasAttr, assertHasNotAttr, get_tests_data_path, get_test from TTS.config import load_config from TTS.encoder.utils.generic_utils import setup_encoder_model from TTS.tts.configs.vits_config import VitsConfig -from TTS.tts.models.vits import Vits, VitsArgs, amp_to_db, db_to_amp, load_audio, spec_to_mel, wav_to_mel, wav_to_spec +from TTS.tts.models.vits import ( + Vits, + VitsArgs, + VitsAudioConfig, + amp_to_db, + db_to_amp, + load_audio, + spec_to_mel, + wav_to_mel, + wav_to_spec, +) from TTS.tts.utils.speakers import SpeakerManager LANG_FILE = os.path.join(get_tests_input_path(), "language_ids.json") @@ -421,8 +431,10 @@ class TestVits(unittest.TestCase): self._check_parameter_changes(model, model_ref) def test_train_step_upsampling(self): + """Upsampling by the decoder upsampling layers""" # setup the model with torch.autograd.set_detect_anomaly(True): + audio_config = VitsAudioConfig(sample_rate=22050) model_args = VitsArgs( num_chars=32, spec_segment_size=10, @@ -430,7 +442,7 @@ class TestVits(unittest.TestCase): interpolate_z=False, upsample_rates_decoder=[8, 8, 4, 2], ) - config = VitsConfig(model_args=model_args) + config = VitsConfig(model_args=model_args, audio=audio_config) model = Vits(config).to(device) model.train() # model to train @@ -459,10 +471,18 @@ class TestVits(unittest.TestCase): self._check_parameter_changes(model, model_ref) def test_train_step_upsampling_interpolation(self): + """Upsampling by interpolation""" # setup the model with torch.autograd.set_detect_anomaly(True): - model_args = VitsArgs(num_chars=32, spec_segment_size=10, encoder_sample_rate=11025, interpolate_z=True) - config = VitsConfig(model_args=model_args) + audio_config = VitsAudioConfig(sample_rate=22050) + model_args = VitsArgs( + num_chars=32, + spec_segment_size=10, + encoder_sample_rate=11025, + interpolate_z=True, + upsample_rates_decoder=[8, 8, 2, 2], + ) + config = VitsConfig(model_args=model_args, audio=audio_config) model = Vits(config).to(device) model.train() # model to train From ec4501d31c388f3c6a64234fe08b3de08353e380 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Fri, 26 Aug 2022 15:36:01 +0200 Subject: [PATCH 19/32] Make artic formatter compatible with changes made to other formatters (root_path is a part of items) --- TTS/tts/datasets/formatters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 4ba14ef0..40b26679 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -599,7 +599,7 @@ def artic(root_path, meta_file, **kwargs): # pylint: disable=unused-argument # In either way, wav name is stored in `cols[0]` and text in `cols[-1]` wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") text = cols[-1] - items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) return items From 8cfbe23d9e6bddea2a482c5c57f73bc68e9fea05 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Tue, 13 Sep 2022 17:32:25 +0200 Subject: [PATCH 20/32] Parse speaker name in artic dataset to extract language and append language item Add comments --- TTS/tts/datasets/formatters.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 
deletions(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 40b26679..b811e88a 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -581,11 +581,25 @@ def kokoro(root_path, meta_file, **kwargs): # pylint: disable=unused-argument def artic(root_path, meta_file, **kwargs): # pylint: disable=unused-argument - """Normalizes the ARTIC meta data file to TTS format""" + """Normalizes the ARTIC meta data file to TTS format + + Args: + root_path (str): path to the artic dataset + meta_file (str): name of the meta file containing names of wav to select and + transcripts of the corresponding utterances + + Returns: + List[Dict]: List of (text, audio_file, speaker_name, language, root_path) dicts associated with each utterance + """ txt_file = os.path.join(root_path, meta_file) items = [] # Speaker name is the name of the directory with the data (last part of `root_path`) speaker_name = os.path.basename(os.path.normpath(root_path)) + # Speaker name can consist of language code (eg. cs-CZ) and gender (m/f) separated by dots + # Example: AndJa.cs-CZ.m + parts = speaker_name.split(".") + lang = parts[1] if len(parts) == 3 and "-" in parts[1] else None + print(f" > ARTIC dataset: voice {parts[0]}, language {lang}") with open(txt_file, "r", encoding="utf-8") as ttf: for line in ttf: # Check the number of standard separators @@ -599,7 +613,7 @@ def artic(root_path, meta_file, **kwargs): # pylint: disable=unused-argument # In either way, wav name is stored in `cols[0]` and text in `cols[-1]` wav_file = os.path.join(root_path, "wavs", cols[0] + ".wav") text = cols[-1] - items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}) + items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "language": lang, "root_path": root_path}) return items From abca11714e6f2996fe037cf1ebc4e45e081e338e Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Thu, 23 Feb 2023 18:51:56 +0100 Subject: [PATCH 21/32] Support external durations and input text (sentence) based length scale in VITS Add aux_input to propagate user parameters to inference --- TTS/tts/models/vits.py | 7 +++++-- TTS/tts/utils/synthesis.py | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 14c76add..c52fde2e 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -1082,11 +1082,12 @@ class Vits(BaseTTS): return aux_input["x_lengths"] return torch.tensor(x.shape[1:2]).to(x.device) + # JMa: add `length_scale`` to `aux_input` to enable changing length (duration) per each input text (sentence) @torch.no_grad() def inference( self, x, - aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None, "language_ids": None, "durations": None}, + aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None, "language_ids": None, "durations": None, "length_scale": None}, ): # pylint: disable=dangerous-default-value """ Note: @@ -1134,7 +1135,9 @@ class Vits(BaseTTS): logw = self.duration_predictor( x, x_mask, g=g if self.args.condition_dp_on_speaker else None, lang_emb=lang_emb ) - w = torch.exp(logw) * x_mask * self.length_scale + # JMa: length scale for the given sentence-like input + length_scale = aux_input["length_scale"] if aux_input["length_scale"] else self.length_scale + w = torch.exp(logw) * x_mask * length_scale else: assert durations.shape[-1] == x.shape[-1] w = durations.unsqueeze(0) diff --git
a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 039816db..63a348d0 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -21,6 +21,7 @@ def compute_style_mel(style_wav, ap, cuda=False): return style_mel +# JMa: add `aux_input` to enable extra input (length_scale, durations) def run_model_torch( model: nn.Module, inputs: torch.Tensor, @@ -29,6 +30,7 @@ def run_model_torch( style_text: str = None, d_vector: torch.Tensor = None, language_id: torch.Tensor = None, + aux_input: Dict = {}, ) -> Dict: """Run a torch model for inference. It does not support batch inference. @@ -56,6 +58,11 @@ def run_model_torch( "style_mel": style_mel, "style_text": style_text, "language_ids": language_id, + # JMa: add `durations`` and `length_scale`` to `aux_input` to enable changing length (durations) per each input text (sentence) + # - `length_scale` changes length of the whole generated wav + # - `durations` sets up duration (in frames) for each input text ID + "durations": aux_input.get("durations", None), + "length_scale": aux_input.get("length_scale", None), }, ) return outputs @@ -110,6 +117,7 @@ def apply_griffin_lim(inputs, input_lens, CONFIG, ap): return wavs +# JMa: add `aux_input` to enable extra input (length_scale, durations) def synthesis( model, text, @@ -122,6 +130,7 @@ def synthesis( do_trim_silence=False, d_vector=None, language_id=None, + aux_input={}, ): """Synthesize voice for the given text using Griffin-Lim vocoder or just compute output features to be passed to the vocoder model. @@ -218,10 +227,14 @@ def synthesis( style_text, d_vector=d_vector, language_id=language_id, + # JMa: add `aux_input` to enable extra input (length_scale, durations) + aux_input=aux_input, ) model_outputs = outputs["model_outputs"] model_outputs = model_outputs[0].data.cpu().numpy() alignments = outputs["alignments"] + # JMa: extract durations + durations = outputs.get("durations", None) # convert outputs to numpy # plot results @@ -240,6 +253,8 @@ def synthesis( "alignments": alignments, "text_inputs": text_inputs, "outputs": outputs, + # JMa: return durations + "durations": durations, } return return_dict From 8a29d57ff0131f536a05a5a9ca2661a4b62fd1cc Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Sat, 25 Feb 2023 11:26:55 +0100 Subject: [PATCH 22/32] Replace values by dict in output of synthesis() in test_run() in vits --- TTS/tts/models/vits.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index c52fde2e..ad8e65eb 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -1439,7 +1439,8 @@ class Vits(BaseTTS): test_sentences = self.config.test_sentences for idx, s_info in enumerate(test_sentences): aux_inputs = self.get_aux_input_from_test_sentences(s_info) - wav, alignment, _, _ = synthesis( + # JMa: replace individual variables with dictionary + outputs = synthesis( self, aux_inputs["text"], self.config, @@ -1450,9 +1451,9 @@ class Vits(BaseTTS): language_id=aux_inputs["language_id"], use_griffin_lim=True, do_trim_silence=False, - ).values() - test_audios["{}-audio".format(idx)] = wav - test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False) + ) + test_audios["{}-audio".format(idx)] = outputs["wav"] + test_figures["{}-alignment".format(idx)] = plot_alignment(outputs["alignments"].T, output_fig=False) return {"figures": test_figures, "audios": test_audios} def test_log( From 687789558e151c07b022b20506dd4a4ec00401c0 Mon Sep 17 00:00:00 2001 From: 
Jindrich Matousek Date: Fri, 3 Mar 2023 20:41:19 +0100 Subject: [PATCH 23/32] Enable ensuring minimum length per token --- TTS/tts/models/vits.py | 48 +++++++++++++++++++++++++++++++++++--- TTS/tts/utils/synthesis.py | 31 +++++++++++------------- 2 files changed, 59 insertions(+), 20 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index ad8e65eb..28f91a73 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -1081,13 +1081,48 @@ class Vits(BaseTTS): if "x_lengths" in aux_input and aux_input["x_lengths"] is not None: return aux_input["x_lengths"] return torch.tensor(x.shape[1:2]).to(x.device) + + # JMa: set minimum duration if predicted duration is lower than threshold + # Workaround to avoid short durations that cause some chars/phonemes to be reduced + # @staticmethod + # def _set_min_inference_length(d, threshold): + # d_mask = d < threshold + # d[d_mask] = threshold + # return d + + def _set_min_inference_length(self, x, durs, threshold): + punctlike = list(self.config.characters.punctuations) + [self.config.characters.blank] + # Get list of tokens from IDs + tokens = x.squeeze().tolist() + # Check current and next token + n = self.tokenizer.characters.id_to_char(tokens[0]) + # for ix, (c, n) in enumerate(zip(tokens[:-1], tokens[1:])): + for ix, idx in enumerate(tokens[1:]): + # c = self.tokenizer.characters.id_to_char(id_c) + c = n + n = self.tokenizer.characters.id_to_char(idx) + if c in punctlike: + # Skip thresholding for punctuation + continue + # Add duration from next punctuation if possible + d = durs[:,:,ix] + durs[:,:,ix+1] if n in punctlike else durs[:,:,ix] + # Threshold duration if duration lower than threshold + if d < threshold: + durs[:,:,ix] = threshold + return durs - # JMa: add `length_scale`` to `aux_input` to enable changing length (duration) per each input text (sentence) @torch.no_grad() def inference( self, x, - aux_input={"x_lengths": None, "d_vectors": None, "speaker_ids": None, "language_ids": None, "durations": None, "length_scale": None}, + aux_input={"x_lengths": None, + "d_vectors": None, + "speaker_ids": None, + "language_ids": None, + "durations": None, + "length_scale": None, # JMa: add `length_scale`` to `aux_input` to enable changing length (duration) per each input text (sentence) + "min_input_length": 0 # JMa: set minimum length if predicted length is lower than `min_input_length` + }, ): # pylint: disable=dangerous-default-value """ Note: @@ -1107,6 +1142,9 @@ class Vits(BaseTTS): - m_p: :math:`[B, C, T_dec]` - logs_p: :math:`[B, C, T_dec]` """ + # JMa: Save input + x_input = x + sid, g, lid, durations = self._set_cond_input(aux_input) x_lengths = self._set_x_lengths(x, aux_input) @@ -1135,9 +1173,13 @@ class Vits(BaseTTS): logw = self.duration_predictor( x, x_mask, g=g if self.args.condition_dp_on_speaker else None, lang_emb=lang_emb ) + # JMa: set minimum duration if required + # w = self._set_min_inference_length(torch.exp(logw) * x_mask, aux_input["min_input_length"]) if aux_input["min_input_length"] else torch.exp(logw) * x_mask + w = self._set_min_inference_length(x_input, torch.exp(logw) * x_mask, aux_input["min_input_length"]) if aux_input["min_input_length"] else torch.exp(logw) * x_mask # JMa: length scale for the given sentence-like input length_scale = aux_input["length_scale"] if aux_input["length_scale"] else self.length_scale - w = torch.exp(logw) * x_mask * length_scale + w *= length_scale + # w = torch.exp(logw) * x_mask * length_scale else: assert durations.shape[-1] == x.shape[-1] w = 
durations.unsqueeze(0) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 63a348d0..30e1dcb0 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -30,7 +30,7 @@ def run_model_torch( style_text: str = None, d_vector: torch.Tensor = None, language_id: torch.Tensor = None, - aux_input: Dict = {}, + aux_input: Dict = {"durations": None, "length_scale": None, "min_input_length": 0}, ) -> Dict: """Run a torch model for inference. It does not support batch inference. @@ -49,22 +49,19 @@ def run_model_torch( _func = model.module.inference else: _func = model.inference - outputs = _func( - inputs, - aux_input={ - "x_lengths": input_lengths, - "speaker_ids": speaker_id, - "d_vectors": d_vector, - "style_mel": style_mel, - "style_text": style_text, - "language_ids": language_id, - # JMa: add `durations`` and `length_scale`` to `aux_input` to enable changing length (durations) per each input text (sentence) - # - `length_scale` changes length of the whole generated wav - # - `durations` sets up duration (in frames) for each input text ID - "durations": aux_input.get("durations", None), - "length_scale": aux_input.get("length_scale", None), - }, - ) + # JMa: propagate `durations``, `length_scale``, and `min_input_length` to `aux_input` + # to enable changing length (durations) per each input text (sentence) and to set + # minimum allowed length of each input char/phoneme + # - `length_scale` changes length of the whole generated wav + # - `durations` sets up duration (in frames) for each input text ID + # - minimum allowed length (in frames) per input ID (char/phoneme) during inference + aux_input["x_lengths"] = input_lengths + aux_input["speaker_ids"] = speaker_id + aux_input["d_vectors"] = d_vector + aux_input["style_mel"] = style_mel + aux_input["style_text"] = style_text + aux_input["language_ids"] = language_id + outputs = _func(inputs, aux_input) return outputs From fcfecf63105df073d9d5ce71feff10195c7991e0 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Thu, 9 Mar 2023 16:32:29 +0100 Subject: [PATCH 24/32] Fix usage of `aux_input["min_input_length"]` when running `test_run()` during training --- TTS/tts/models/vits.py | 2 +- TTS/tts/utils/synthesis.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 28f91a73..a6dbf2e0 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -1175,7 +1175,7 @@ class Vits(BaseTTS): ) # JMa: set minimum duration if required # w = self._set_min_inference_length(torch.exp(logw) * x_mask, aux_input["min_input_length"]) if aux_input["min_input_length"] else torch.exp(logw) * x_mask - w = self._set_min_inference_length(x_input, torch.exp(logw) * x_mask, aux_input["min_input_length"]) if aux_input["min_input_length"] else torch.exp(logw) * x_mask + w = self._set_min_inference_length(x_input, torch.exp(logw) * x_mask, aux_input["min_input_length"]) if aux_input.get("min_input_length", 0) else torch.exp(logw) * x_mask # JMa: length scale for the given sentence-like input length_scale = aux_input["length_scale"] if aux_input["length_scale"] else self.length_scale w *= length_scale diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 30e1dcb0..59897452 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -30,7 +30,7 @@ def run_model_torch( style_text: str = None, d_vector: torch.Tensor = None, language_id: torch.Tensor = None, - aux_input: Dict = {"durations": None, "length_scale": 
None, "min_input_length": 0}, + aux_input: Dict = {}, ) -> Dict: """Run a torch model for inference. It does not support batch inference. @@ -49,9 +49,9 @@ def run_model_torch( _func = model.module.inference else: _func = model.inference - # JMa: propagate `durations``, `length_scale``, and `min_input_length` to `aux_input` - # to enable changing length (durations) per each input text (sentence) and to set - # minimum allowed length of each input char/phoneme + # JMa: propagate other inputs like `durations``, `length_scale``, and `min_input_length` + # to `aux_input` to enable changing length (durations) per each input text (sentence) + # and to set minimum allowed length of each input char/phoneme # - `length_scale` changes length of the whole generated wav # - `durations` sets up duration (in frames) for each input text ID # - minimum allowed length (in frames) per input ID (char/phoneme) during inference @@ -114,7 +114,7 @@ def apply_griffin_lim(inputs, input_lens, CONFIG, ap): return wavs -# JMa: add `aux_input` to enable extra input (length_scale, durations) +# JMa: add `aux_input` to enable extra input (like length_scale, durations) def synthesis( model, text, From 67edc4e40f8c4f2a4d1ce8870c2a96ee1876a458 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Mon, 13 Mar 2023 21:13:51 +0100 Subject: [PATCH 25/32] Fix length scale handling and default value --- TTS/tts/models/vits.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index a6dbf2e0..edc6b904 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -1120,7 +1120,7 @@ class Vits(BaseTTS): "speaker_ids": None, "language_ids": None, "durations": None, - "length_scale": None, # JMa: add `length_scale`` to `aux_input` to enable changing length (duration) per each input text (sentence) + "length_scale": 1.0, # JMa: add `length_scale`` to `aux_input` to enable changing length (duration) per each input text (sentence) "min_input_length": 0 # JMa: set minimum length if predicted length is lower than `min_input_length` }, ): # pylint: disable=dangerous-default-value @@ -1177,7 +1177,7 @@ class Vits(BaseTTS): # w = self._set_min_inference_length(torch.exp(logw) * x_mask, aux_input["min_input_length"]) if aux_input["min_input_length"] else torch.exp(logw) * x_mask w = self._set_min_inference_length(x_input, torch.exp(logw) * x_mask, aux_input["min_input_length"]) if aux_input.get("min_input_length", 0) else torch.exp(logw) * x_mask # JMa: length scale for the given sentence-like input - length_scale = aux_input["length_scale"] if aux_input["length_scale"] else self.length_scale + length_scale = aux_input.get("length_scale", self.length_scale) w *= length_scale # w = torch.exp(logw) * x_mask * length_scale else: From c04f5046a23552052235477cb57402659d2ecc6c Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Thu, 16 Mar 2023 15:42:52 +0100 Subject: [PATCH 26/32] Delete `synthesize_file.py` as it was moved to Coqui-TTS_utils repo --- TTS/bin/synthesize_file.py | 337 ------------------------------------- 1 file changed, 337 deletions(-) delete mode 100755 TTS/bin/synthesize_file.py diff --git a/TTS/bin/synthesize_file.py b/TTS/bin/synthesize_file.py deleted file mode 100755 index cae5092a..00000000 --- a/TTS/bin/synthesize_file.py +++ /dev/null @@ -1,337 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import argparse -from distutils.command.config import config -import sys -from argparse import RawTextHelpFormatter - -# pylint: 
disable=redefined-outer-name, unused-argument -from pathlib import Path, PurePath - -sys.path.insert(0, "/storage/plzen4-ntis/home/jmatouse/GIT_repos/Coqui-TTS.mod-0.6.1") - -from TTS.utils.manage import ModelManager -from TTS.utils.synthesizer import Synthesizer - - -def str2bool(v): - if isinstance(v, bool): - return v - if v.lower() in ("yes", "true", "t", "y", "1"): - return True - if v.lower() in ("no", "false", "f", "n", "0"): - return False - raise argparse.ArgumentTypeError("Boolean value expected.") - - -def main(): - # pylint: disable=bad-option-value - parser = argparse.ArgumentParser( - description="""Synthesize speech on command line.\n\n""" - """You can either use your trained model or choose a model from the provided list.\n\n""" - """If you don't specify any models, then it uses LJSpeech based English model.\n\n""" - """ - # Example Runs: - - ## Single Speaker Models - - - list provided models - - ``` - $ ./TTS/bin/synthesize.py --list_models - ``` - - - run tts with default models. - - ``` - $ ./TTS/bin synthesize.py --text "Text for TTS" - ``` - - - run a tts model with its default vocoder model. - - ``` - $ ./TTS/bin synthesize.py --text "Text for TTS" --model_name "// - ``` - - - run with specific tts and vocoder models from the list - - ``` - $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_name "//" --vocoder_name "//" --output_path - ``` - - - run your own TTS model (Using Griffin-Lim Vocoder) - - ``` - $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/model.pth.tar --config_path path/to/config.json --out_path output/path/speech.wav - ``` - - - run your own TTS and Vocoder models - ``` - $ ./TTS/bin/synthesize.py --text "Text for TTS" --model_path path/to/config.json --config_path path/to/model.pth.tar --out_path output/path/speech.wav - --vocoder_path path/to/vocoder.pth.tar --vocoder_config_path path/to/vocoder_config.json - ``` - - ## MULTI-SPEAKER MODELS - - - list the available speakers and choose as among them. - - ``` - $ ./TTS/bin/synthesize.py --model_name "//" --list_speaker_idxs - ``` - - - run the multi-speaker TTS model with the target speaker ID. - - ``` - $ ./TTS/bin/synthesize.py --text "Text for TTS." --out_path output/path/speech.wav --model_name "//" --speaker_idx - ``` - - - run your own multi-speaker TTS model. - - ``` - $ ./TTS/bin/synthesize.py --text "Text for TTS" --out_path output/path/speech.wav --model_path path/to/config.json --config_path path/to/model.pth.tar --speakers_file_path path/to/speaker.json --speaker_idx - ``` - """, - formatter_class=RawTextHelpFormatter, - ) - - parser.add_argument( - "--list_models", - type=str2bool, - nargs="?", - const=True, - default=False, - help="list available pre-trained tts and vocoder models.", - ) - parser.add_argument("--text_file", type=str, default=None, help="Text file to generate speech from.") - - # Args for running pre-trained TTS models. 
- parser.add_argument( - "--model_name", - type=str, - default="tts_models/en/ljspeech/tacotron2-DDC", - help="Name of one of the pre-trained tts models in format //", - ) - parser.add_argument( - "--vocoder_name", - type=str, - default=None, - help="Name of one of the pre-trained vocoder models in format //", - ) - - # Args for running custom models - parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.") - parser.add_argument( - "--model_path", - type=str, - default=None, - help="Path to model file.", - ) - parser.add_argument( - "--out_dir", - type=str, - default="", - help="Output wav file path directory.", - ) - parser.add_argument( - "--out_name", - type=str, - default="utt", - help="Output wav filename.", - ) - parser.add_argument( - "--out_path", - type=str, - default="", - help="Output wav file path.", - ) - parser.add_argument( - "--concat_audio", - action='store_true', - help="Concatenate audio to a single output file", - default=False - ) - parser.add_argument( - "-1", "--use_infile_label", - action='store_true', - help="Use in-file label (1st word) as output file name", - default=False - ) - parser.add_argument( - "--rm_last_word", - action='store_true', - help="Remove last word (typically corresponding to a pause)", - default=False - ) - parser.add_argument("--use_cuda", action='store_true', help="Run model on CUDA.", default=False) - parser.add_argument( - "--vocoder_path", - type=str, - help="Path to vocoder model file. If it is not defined, model uses GL as vocoder. Please make sure that you installed vocoder library before (WaveRNN).", - default=None, - ) - parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None) - parser.add_argument( - "--encoder_path", - type=str, - help="Path to speaker encoder model file.", - default=None, - ) - parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None) - - # args for multi-speaker synthesis - parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) - parser.add_argument( - "--speaker_idx", - type=str, - help="Target speaker ID for a multi-speaker TTS model.", - default=None, - ) - parser.add_argument( - "--speaker_wav", - nargs="+", - help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. 
The d_vectors is computed as their average.", - default=None, - ) - parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None) - parser.add_argument( - "--list_speaker_idxs", - help="List available speaker ids for the defined multi-speaker model.", - type=str2bool, - nargs="?", - const=True, - default=False, - ) - # aux args - parser.add_argument( - "--save_spectogram", - action='store_true', - help="If true save raw spectogram for further (vocoder) processing in out_path.", - default=False, - ) - - args = parser.parse_args() - - # print the description if either text or list_models is not set - if args.text_file is None and not args.list_models and not args.list_speaker_idxs: - parser.parse_args(["-h"]) - - # load model manager - path = Path(__file__).parent / "../.models.json" - manager = ModelManager(path) - - model_path = None - config_path = None - speakers_file_path = None - vocoder_path = None - vocoder_config_path = None - encoder_path = None - encoder_config_path = None - - # CASE1: list pre-trained TTS models - if args.list_models: - manager.list_models() - sys.exit() - - # CASE2: load pre-trained model paths - if args.model_name is not None and not args.model_path: - model_path, config_path, model_item = manager.download_model(args.model_name) - args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name - - if args.vocoder_name is not None and not args.vocoder_path: - vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) - - # CASE3: set custom model paths - # JMa: if config is not given => use config from the corresponding model/vocoder/encoder path - if args.model_path is not None: - model_path = args.model_path - config_path = args.config_path if args.config_path else PurePath(Path(model_path).parent, "config.json") - speakers_file_path = args.speakers_file_path - - if args.vocoder_path is not None: - vocoder_path = args.vocoder_path - vocoder_config_path = args.vocoder_config_path if args.vocoder_config_path else PurePath(Path(vocoder_path).parent, "config.json") - - if args.encoder_path is not None: - encoder_path = args.encoder_path - encoder_config_path = args.encoder_config_path if args.encoder_config_path else PurePath(Path(encoder_path).parent, "config.json") - - # load models - synthesizer = Synthesizer( - model_path, - config_path, - speakers_file_path, - vocoder_path, - vocoder_config_path, - encoder_path, - encoder_config_path, - args.use_cuda, - ) - - # query speaker ids of a multi-speaker model. - if args.list_speaker_idxs: - print( - " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." - ) - print(synthesizer.tts_model.speaker_manager.speaker_ids) - return - - # check the arguments against a multi-speaker model. - if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav): - print( - " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to " - "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`." - ) - return - - # Read lines (=sentences) from the input text file - with open(args.text_file, 'rt') as fr: - lines = fr.read().splitlines() - - # Resulting wav - tot_wav = [] - - # RUN THE SYNTHESIS line-by-line - for ix, line in enumerate(lines): - # Extract words - words = line.split() - - # Use first word as utterance name? 
- if args.use_infile_label: - uname = words[0] - sent_beg = 1 - else: - uname = "{}{:03d}".format(args.out_name, ix) - sent_beg = 0 - - # Remove last word? - sent_end = -1 if args.rm_last_word else len(words) - - # Prepare text to synthesize - text = " ".join(words[sent_beg:sent_end]) - - # kick it - wav = synthesizer.tts(text, args.speaker_idx, args.speaker_wav, args.gst_style) - - # Concatenate resulting wav - if args.concat_audio: - print(" > Text #{:03d}: {}".format(ix, text)) - tot_wav.append(wav) - else: - # Save the wav for each line - # print(" > Saving output to {}".format(out_path)) - # Prepare output path - out_path = PurePath(args.out_dir, "{}.wav".format(uname)) - print(" > Text #{:03d}: {} --> {}".format(ix, text, out_path)) - synthesizer.save_wav(wav, out_path) - - if args.concat_audio: - # Concatenate resulting wav - print(" > Saving audio to {}".format(args.out_path)) - single_wav = [] - for wav in tot_wav: - single_wav.extend(list(wav)) - synthesizer.save_wav(single_wav, args.out_path) - -if __name__ == "__main__": - main() From d3661d7d26f794f8a4ea814769d9db81fbd03510 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Fri, 4 Aug 2023 13:54:51 +0200 Subject: [PATCH 27/32] Fix artic_multispeaker formatter --- TTS/tts/datasets/formatters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 4d5ee46a..68b27d61 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -641,7 +641,7 @@ def artic_multispeaker(root_path, meta_file, ignored_speakers=None): # pylint: d """ items = [] # Loop over speakers: speaker names are subdirs of `root_path` - for pth in glob(f"{root_path}/*", recursive=False): + for pth in glob(f"{root_path}/*/", recursive=False): speaker_name = os.path.basename(pth) # Ignore speakers if isinstance(ignored_speakers, list): From 874143bf04ceaaa367ff6c4675821ad51aa19ab8 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Sun, 6 Aug 2023 13:17:53 +0200 Subject: [PATCH 28/32] Add support for phone (char) based length scale Remove length_scale from default aux_input --- TTS/tts/models/vits.py | 23 +++++++++++++++++++---- TTS/tts/utils/synthesis.py | 1 + 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 29a0646b..c001412a 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -1123,7 +1123,6 @@ class Vits(BaseTTS): "speaker_ids": None, "language_ids": None, "durations": None, - "length_scale": 1.0, # JMa: add `length_scale`` to `aux_input` to enable changing length (duration) per each input text (sentence) "min_input_length": 0 # JMa: set minimum length if predicted length is lower than `min_input_length` }, ): # pylint: disable=dangerous-default-value @@ -1136,6 +1135,8 @@ class Vits(BaseTTS): - x_lengths: :math:`[B]` - d_vectors: :math:`[B, C]` - speaker_ids: :math:`[B]` + - durations: :math: `[B, T_seq]` + - length_scale: :math: `[B, T_seq]` or `[B]` Return Shapes: - model_outputs: :math:`[B, 1, T_wav]` @@ -1177,13 +1178,27 @@ class Vits(BaseTTS): x, x_mask, g=g if self.args.condition_dp_on_speaker else None, lang_emb=lang_emb ) # JMa: set minimum duration if required - # w = self._set_min_inference_length(torch.exp(logw) * x_mask, aux_input["min_input_length"]) if aux_input["min_input_length"] else torch.exp(logw) * x_mask w = self._set_min_inference_length(x_input, torch.exp(logw) * x_mask, aux_input["min_input_length"]) if aux_input.get("min_input_length", 0) else 
torch.exp(logw) * x_mask + # JMa: length scale for the given sentence-like input + # ORIG: w = torch.exp(logw) * x_mask * self.length_scale + # If `length_scale` is in `aux_input`, it resets the default value given by `self.length_scale`, + # otherwise the default `self.length_scale` from `config.json` is used. length_scale = aux_input.get("length_scale", self.length_scale) - w *= length_scale - # w = torch.exp(logw) * x_mask * length_scale + # JMa: `length_scale` is used to scale duration relatively to the predicted values, it should be: + # - float (or int) => duration of the output speech will be linearly scaled + # - torch vector `[B, T_seq]`` (`B`` is batch size, `T_seq`` is the length of the input symbols) + # => each input symbol (phone or char) is scaled according to the corresponding value in the torch vector + if isinstance(length_scale, float) or isinstance(length_scale, int): + w *= length_scale + else: + assert length_scale.shape[-1] == w.shape[-1] + w *= length_scale.unsqueeze(0) + else: + # To force absolute durations (in frames), "durations" has to be in `aux_input`. + # The durations should be a torch vector [B, N] (`B`` is batch size, `T_seq`` is the length of the input symbols) + # => each input symbol (phone or char) will have the duration given by the corresponding value (number of frames) in the torch vector assert durations.shape[-1] == x.shape[-1] w = durations.unsqueeze(0) diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py index 59897452..ccdecaf6 100644 --- a/TTS/tts/utils/synthesis.py +++ b/TTS/tts/utils/synthesis.py @@ -215,6 +215,7 @@ def synthesis( text_inputs = numpy_to_torch(text_inputs, torch.long, cuda=use_cuda) text_inputs = text_inputs.unsqueeze(0) + # synthesize voice outputs = run_model_torch( model, From 37807fef8b22aa39f60568edf8b235e31f01c604 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Wed, 23 Aug 2023 11:52:14 +0100 Subject: [PATCH 29/32] Add vctk_wav formatter: it is the same as vctk but uses wav extension instead of flac --- TTS/tts/datasets/formatters.py | 48 ++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 2807cd6c..2ea75ec3 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -403,6 +403,54 @@ def vctk(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic return items +# JMa: VCTK with wav files (not flac) +def vctk_wav(root_path, meta_files=None, wavs_path="wav48_silence_trimmed", mic="mic1", ignored_speakers=None): + """VCTK dataset v0.92. + + URL: + https://datashare.ed.ac.uk/bitstream/handle/10283/3443/VCTK-Corpus-0.92.zip + + This dataset has 2 recordings per speaker that are annotated with ```mic1``` and ```mic2```. + It is believed that (😄 ) ```mic1``` files are the same as the previous version of the dataset. + + mic1: + Audio recorded using an omni-directional microphone (DPA 4035). + Contains very low frequency noises. + This is the same audio released in previous versions of VCTK: + https://doi.org/10.7488/ds/1994 + + mic2: + Audio recorded using a small diaphragm condenser microphone with + very wide bandwidth (Sennheiser MKH 800). + Two speakers, p280 and p315 had technical issues of the audio + recordings using MKH 800. 
+ """ + file_ext = "wav" + items = [] + meta_files = glob(f"{os.path.join(root_path,'txt')}/**/*.txt", recursive=True) + for meta_file in meta_files: + _, speaker_id, txt_file = os.path.relpath(meta_file, root_path).split(os.sep) + file_id = txt_file.split(".")[0] + # ignore speakers + if isinstance(ignored_speakers, list): + if speaker_id in ignored_speakers: + continue + with open(meta_file, "r", encoding="utf-8") as file_text: + text = file_text.readlines()[0] + # p280 has no mic2 recordings + if speaker_id == "p280": + wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + f"_mic1.{file_ext}") + else: + wav_file = os.path.join(root_path, wavs_path, speaker_id, file_id + f"_{mic}.{file_ext}") + if os.path.exists(wav_file): + items.append( + {"text": text, "audio_file": wav_file, "speaker_name": "VCTK_" + speaker_id, "root_path": root_path} + ) + else: + print(f" [!] wav files don't exist - {wav_file}") + return items + + def vctk_old(root_path, meta_files=None, wavs_path="wav48", ignored_speakers=None): """homepages.inf.ed.ac.uk/jyamagis/release/VCTK-Corpus.tar.gz""" items = [] From a0db2eeee81afa64657a1268b867fa02b0035c11 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Wed, 6 Sep 2023 13:59:24 +0200 Subject: [PATCH 30/32] Fix: add `is_eval` when calling `get_sampler` to respect training/validation --- TTS/tts/models/vits.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py index 057ffe7e..c11d9b96 100644 --- a/TTS/tts/models/vits.py +++ b/TTS/tts/models/vits.py @@ -1679,7 +1679,8 @@ class Vits(BaseTTS): dataset.preprocess_samples() # get samplers - sampler = self.get_sampler(config, dataset, num_gpus) + # JMa: Add `is_eval` parameter because the default is `False` and `batch_size` was used instead of `eval_batch_size` + sampler = self.get_sampler(config, dataset, num_gpus, is_eval) if sampler is None: loader = DataLoader( dataset, From c312343585daf7da2ddd275d37da446419cb4367 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Wed, 6 Sep 2023 17:05:47 +0200 Subject: [PATCH 31/32] Language of each item (sample/utterance) is set to dataset language only when not defined at the sample/utterance level Speaker name is prepended by dataset name in case of multispeaker datasets Refactor "artic" formatter --- TTS/tts/datasets/__init__.py | 8 ++++++-- TTS/tts/datasets/formatters.py | 13 ++++++++----- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 19213856..02434917 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -58,8 +58,12 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): def add_extra_keys(metadata, language, dataset_name): for item in metadata: - # add language name - item["language"] = language + # JMa: Add language name only if not defined at the sample level. Could be good for multi-language datasets. + if not item["language"]: + item["language"] = language + # JMa: Prepend dataset name to speaker name. Could be good for multispeaker datasets. 
+ if item["speaker_name"] != dataset_name and not item["speaker_name"].startswith(dataset_name+"_"): + item["speaker_name"] = f'{dataset_name}_{item["speaker_name"]}' # add unique audio name relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0] audio_unique_name = f"{dataset_name}#{relfilepath}" item["audio_unique_name"] = audio_unique_name diff --git a/TTS/tts/datasets/formatters.py b/TTS/tts/datasets/formatters.py index 2ea75ec3..113fc8d1 100644 --- a/TTS/tts/datasets/formatters.py +++ b/TTS/tts/datasets/formatters.py @@ -652,11 +652,14 @@ def artic(root_path, meta_file, **kwargs): # pylint: disable=unused-argument items = [] # Speaker name is the name of the directory with the data (last part of `root_path`) speaker_name = os.path.basename(os.path.normpath(root_path)) - # Speaker name can consist of language code (eg. cs-CZ) and gender (m/f) separated by dots - # Example: AndJa.cs-CZ.m - parts = speaker_name.split(".") - lang = parts[1] if len(parts) == 3 and "-" in parts[1] else None - print(f" > ARTIC dataset: voice {parts[0]}, language {lang}") + # Speaker name can consist of language code (eg. cs-CZ or en) and gender (m/f) separated by dots + # Example: AndJa.cs-CZ.m, LJS.en.f + try: + voice, lang, sex = speaker_name.split(".") + except ValueError: + voice = speaker_name + lang, sex = None, None + print(f" > ARTIC dataset: voice={voice}, sex={sex}, language={lang}") with open(txt_file, "r", encoding="utf-8") as ttf: for line in ttf: # Check the number of standard separators From d391eea73377be4bd57f9a25b5e857474a23faa9 Mon Sep 17 00:00:00 2001 From: Jindrich Matousek Date: Tue, 12 Sep 2023 10:57:37 +0200 Subject: [PATCH 32/32] Fix adding dataset name to speaker name Print speaker name notification --- TTS/tts/datasets/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/TTS/tts/datasets/__init__.py b/TTS/tts/datasets/__init__.py index 02434917..41d767a0 100644 --- a/TTS/tts/datasets/__init__.py +++ b/TTS/tts/datasets/__init__.py @@ -57,17 +57,23 @@ def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01): def add_extra_keys(metadata, language, dataset_name): + changes = {} for item in metadata: # JMa: Add language name only if not defined at the sample level. Could be good for multi-language datasets. if not item["language"]: item["language"] = language # JMa: Prepend dataset name to speaker name. Could be good for multispeaker datasets. - if item["speaker_name"] != dataset_name and not item["speaker_name"].startswith(dataset_name+"_"): + if dataset_name and item["speaker_name"] != dataset_name and not item["speaker_name"].startswith(dataset_name+"_"): + changes[item["speaker_name"]] = f'{dataset_name}_{item["speaker_name"]}' item["speaker_name"] = f'{dataset_name}_{item["speaker_name"]}' # add unique audio name relfilepath = os.path.splitext(os.path.relpath(item["audio_file"], item["root_path"]))[0] audio_unique_name = f"{dataset_name}#{relfilepath}" item["audio_unique_name"] = audio_unique_name + # JMa: print changed speaker names if any + if changes: + for k, v in changes.items(): + print(f" | > speaker name changed: {k} --> {v}")
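Usage sketch (illustrative addendum, not part of the patch series): patches 21, 23, 25 and 28 together expose three inference-time controls through `aux_input` — a sentence-level or per-token `length_scale`, a `min_input_length` floor (in frames) applied to predicted token durations, and externally supplied `durations`. The sketch below shows how a caller might exercise them through `TTS.tts.utils.synthesis.synthesis()` once the series is applied. The checkpoint/config paths, the single-speaker setup, the `Vits.init_from_config`/`load_checkpoint`/`tokenizer.text_to_ids` usage and the tensor shapes are assumptions made for the example, not values taken from the patches.

```
# Illustrative sketch, assuming a single-speaker VITS checkpoint trained with this fork.
# Paths, the tokenizer call and tensor shapes below are assumptions, not part of the patches.
import torch

from TTS.config import load_config
from TTS.tts.models.vits import Vits
from TTS.tts.utils.synthesis import synthesis

config = load_config("path/to/config.json")                   # hypothetical path
model = Vits.init_from_config(config)
model.load_checkpoint(config, "path/to/model.pth", eval=True)

text = "Text for TTS"

# 1) Slow the whole sentence down by 10% and floor predicted per-token durations
#    at 3 frames (punctuation is handled separately by _set_min_inference_length).
out = synthesis(
    model,
    text,
    config,
    use_cuda=False,
    use_griffin_lim=True,  # mirrors how test_run() calls synthesis() in vits.py
    aux_input={"length_scale": 1.1, "min_input_length": 3},
)
wav = out["wav"]
pred_durations = out["durations"]  # per-token durations returned since patch 21

# 2) Per-token length scale (patch 28): one factor per input ID; the last dimension
#    must match the tokenized input length, otherwise the model's assert fires.
token_ids = model.tokenizer.text_to_ids(text)  # same tokenization synthesis() applies
scale = torch.ones(1, len(token_ids))
scale[0, :5] = 1.5                             # stretch only the first few tokens
out = synthesis(model, text, config, use_cuda=False, use_griffin_lim=True,
                aux_input={"length_scale": scale})

# 3) Fully external durations: a float tensor of frames per token with the same
#    last dimension as the input IDs (bypasses the duration predictor entirely).
ext_durations = torch.full((1, len(token_ids)), 5.0)
out = synthesis(model, text, config, use_cuda=False, use_griffin_lim=True,
                aux_input={"durations": ext_durations})
```

Because `run_model_torch()` now forwards the same dictionary to `Vits.inference()`, these keys can be combined freely with the speaker and language inputs it fills in itself.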