Fastspeech2 (#2073)

* added EnergyDataset

* add energy to Dataset

* add compute_energy

* added energy params

* added energy to forward_tts

* added plot_avg_energy for visualisation

* Update forward_tts.py

* create file

* added fastspeech2 recipe

* add fastspeech2 config

* removed energy from fast pitch

* add energy loss to forward tts

* Update fastspeech2_config.py

* change run_name

* Update numpy_transforms.py

* fix typo

* fix typo

* fix typo

* linting issues

* use_energy default value --> False

* Update numpy_transforms.py

* linting fixes

* fix typo

* linting fix

* linting fix

* fix

* fixes

* fixes

* lint fix

* lint fixes

* added training test

* wrong import

* wrong import

* trailing whitespace

* style fix

* changed class name because of error

* class name change

* class name change

* change class name

* fixed styles
manmay nakhashi 2023-01-16 03:09:22 +05:30 committed by GitHub
parent 14d45b5347
commit bc422f2f3c
11 changed files with 866 additions and 6 deletions

View File

@@ -100,6 +100,13 @@ class FastPitchConfig(BaseTTSConfig):
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
# dataset configs
compute_f0 (bool):
Compute pitch. Defaults to True.
f0_cache_path (str):
Pitch cache path. Defaults to None.
"""
model: str = "fast_pitch"

View File

@@ -0,0 +1,198 @@
from dataclasses import dataclass, field
from typing import List
from TTS.tts.configs.shared_configs import BaseTTSConfig
from TTS.tts.models.forward_tts import ForwardTTSArgs
@dataclass
class Fastspeech2Config(BaseTTSConfig):
"""Configure `ForwardTTS` as FastPitch model.
Example:
>>> from TTS.tts.configs.fastspeech2_config import Fastspeech2Config
>>> config = Fastspeech2Config()
Args:
model (str):
Model name used for selecting the right model at initialization. Defaults to `fastspeech2`.
base_model (str):
Name of the base model being configured as this model so that 🐸 TTS knows it needs to initiate
the base model rather than searching for the `model` implementation. Defaults to `forward_tts`.
model_args (Coqpit):
Model class arguments. Check `ForwardTTSArgs` for more details. Defaults to `ForwardTTSArgs()`.
data_dep_init_steps (int):
Number of steps used for computing normalization parameters at the beginning of the training. GlowTTS uses
Activation Normalization that pre-computes normalization stats at the beginning and uses the same values
for the rest. Defaults to 10.
speakers_file (str):
Path to the file containing the list of speakers. Needed at inference for loading matching speaker ids to
speaker names. Defaults to `None`.
use_speaker_embedding (bool):
Enable/disable using speaker embeddings for multi-speaker models. If set True, the model is
in multi-speaker mode. Defaults to False.
use_d_vector_file (bool):
Enable/disable using external speaker embeddings in place of the learned embeddings. Defaults to False.
d_vector_file (str):
Path to the file including pre-computed speaker embeddings. Defaults to None.
d_vector_dim (int):
Dimension of the external speaker embeddings. Defaults to 0.
optimizer (str):
Name of the model optimizer. Defaults to `Adam`.
optimizer_params (dict):
Arguments of the model optimizer. Defaults to `{"betas": [0.9, 0.998], "weight_decay": 1e-6}`.
lr_scheduler (str):
Name of the learning rate scheduler. Defaults to `NoamLR`.
lr_scheduler_params (dict):
Arguments of the learning rate scheduler. Defaults to `{"warmup_steps": 4000}`.
lr (float):
Initial learning rate. Defaults to `1e-4`.
grad_clip (float):
Gradient norm clipping value. Defaults to `5.0`.
spec_loss_type (str):
Type of the spectrogram loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
duration_loss_type (str):
Type of the duration loss. Check `ForwardTTSLoss` for possible values. Defaults to `mse`.
use_ssim_loss (bool):
Enable/disable the use of SSIM (Structural Similarity) loss. Defaults to True.
wd (float):
Weight decay coefficient. Defaults to `1e-7`.
ssim_loss_alpha (float):
Weight for the SSIM loss. If set 0, disables the SSIM loss. Defaults to 1.0.
dur_loss_alpha (float):
Weight for the duration predictor's loss. If set 0, disables the duration loss. Defaults to 1.0.
spec_loss_alpha (float):
Weight for the L1 spectrogram loss. If set 0, disables the L1 loss. Defaults to 1.0.
pitch_loss_alpha (float):
Weight for the pitch predictor's loss. If set 0, disables the pitch predictor. Defaults to 1.0.
energy_loss_alpha (float):
Weight for the energy predictor's loss. If set 0, disables the energy predictor. Defaults to 1.0.
binary_align_loss_alpha (float):
Weight for the binary loss. If set 0, disables the binary loss. Defaults to 1.0.
binary_loss_warmup_epochs (float):
Number of epochs to gradually increase the binary loss impact. Defaults to 150.
min_seq_len (int):
Minimum input sequence length to be used at training.
max_seq_len (int):
Maximum input sequence length to be used at training. Larger values result in more VRAM usage.
# dataset configs
compute_f0 (bool):
Compute pitch. Defaults to True.
f0_cache_path (str):
Pitch cache path. Defaults to None.
compute_energy (bool):
Compute energy. Defaults to True.
energy_cache_path (str):
Energy cache path. Defaults to None.
"""
model: str = "fastspeech2"
base_model: str = "forward_tts"
# model specific params
model_args: ForwardTTSArgs = ForwardTTSArgs()
# multi-speaker settings
num_speakers: int = 0
speakers_file: str = None
use_speaker_embedding: bool = False
use_d_vector_file: bool = False
d_vector_file: str = None
d_vector_dim: int = 0
# optimizer parameters
optimizer: str = "Adam"
optimizer_params: dict = field(default_factory=lambda: {"betas": [0.9, 0.998], "weight_decay": 1e-6})
lr_scheduler: str = "NoamLR"
lr_scheduler_params: dict = field(default_factory=lambda: {"warmup_steps": 4000})
lr: float = 1e-4
grad_clip: float = 5.0
# loss params
spec_loss_type: str = "mse"
duration_loss_type: str = "mse"
use_ssim_loss: bool = True
ssim_loss_alpha: float = 1.0
spec_loss_alpha: float = 1.0
aligner_loss_alpha: float = 1.0
pitch_loss_alpha: float = 0.1
energy_loss_alpha: float = 0.1
dur_loss_alpha: float = 0.1
binary_align_loss_alpha: float = 0.1
binary_loss_warmup_epochs: int = 150
# overrides
min_seq_len: int = 13
max_seq_len: int = 200
r: int = 1 # DO NOT CHANGE
# dataset configs
compute_f0: bool = True
f0_cache_path: str = None
compute_energy: bool = True
energy_cache_path: str = None
# testing
test_sentences: List[str] = field(
default_factory=lambda: [
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
"Be a voice, not an echo.",
"I'm sorry Dave. I'm afraid I can't do that.",
"This cake is great. It's so delicious and moist.",
"Prior to November 22, 1963.",
]
)
def __post_init__(self):
# Pass multi-speaker parameters to the model args as `model.init_multispeaker()` looks for it there.
if self.num_speakers > 0:
self.model_args.num_speakers = self.num_speakers
# speaker embedding settings
if self.use_speaker_embedding:
self.model_args.use_speaker_embedding = True
if self.speakers_file:
self.model_args.speakers_file = self.speakers_file
# d-vector settings
if self.use_d_vector_file:
self.model_args.use_d_vector_file = True
if self.d_vector_dim is not None and self.d_vector_dim > 0:
self.model_args.d_vector_dim = self.d_vector_dim
if self.d_vector_file:
self.model_args.d_vector_file = self.d_vector_file

View File

@@ -217,6 +217,9 @@ class BaseTTSConfig(BaseTrainingConfig):
compute_f0 (bool):
(Not in use yet).
compute_energy (bool):
(Not in use yet).
compute_linear_spec (bool):
If True data loader computes and returns linear spectrograms alongside the other data.
@@ -312,6 +315,7 @@ class BaseTTSConfig(BaseTrainingConfig):
min_text_len: int = 1
max_text_len: int = float("inf")
compute_f0: bool = False
compute_energy: bool = False
compute_linear_spec: bool = False
precompute_num_workers: int = 0
use_noise_augment: bool = False
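
A hedged sketch of the two new dataset-level flags on a config deriving from `BaseTTSConfig`; the cache path is a placeholder:

from TTS.tts.configs.shared_configs import BaseTTSConfig

config = BaseTTSConfig()
config.compute_energy = True                # data loader returns per-frame energy
config.energy_cache_path = "energy_cache/"  # EnergyDataset writes *_energy.npy files here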

View File

@@ -11,6 +11,7 @@ from torch.utils.data import Dataset
from TTS.tts.utils.data import prepare_data, prepare_stop_target, prepare_tensor
from TTS.utils.audio import AudioProcessor
from TTS.utils.audio.numpy_transforms import compute_energy as calculate_energy
# to prevent too many open files error as suggested here
# https://github.com/pytorch/pytorch/issues/11201#issuecomment-421146936
@@ -50,7 +51,9 @@ class TTSDataset(Dataset):
samples: List[Dict] = None,
tokenizer: "TTSTokenizer" = None,
compute_f0: bool = False,
compute_energy: bool = False,
f0_cache_path: str = None,
energy_cache_path: str = None,
return_wav: bool = False,
batch_group_size: int = 0,
min_text_len: int = 0,
@@ -84,8 +87,12 @@ class TTSDataset(Dataset):
compute_f0 (bool): compute f0 if True. Defaults to False.
compute_energy (bool): compute energy if True. Defaults to False.
f0_cache_path (str): Path to store f0 cache. Defaults to None.
energy_cache_path (str): Path to store energy cache. Defaults to None.
return_wav (bool): Return the waveform of the sample. Defaults to False.
batch_group_size (int): Range of batch randomization after sorting
@@ -128,7 +135,9 @@ class TTSDataset(Dataset):
self.compute_linear_spec = compute_linear_spec
self.return_wav = return_wav
self.compute_f0 = compute_f0
self.compute_energy = compute_energy
self.f0_cache_path = f0_cache_path
self.energy_cache_path = energy_cache_path
self.min_audio_len = min_audio_len
self.max_audio_len = max_audio_len
self.min_text_len = min_text_len
@@ -155,7 +164,10 @@ class TTSDataset(Dataset):
self.f0_dataset = F0Dataset(
self.samples, self.ap, cache_path=f0_cache_path, precompute_num_workers=precompute_num_workers
)
if compute_energy:
self.energy_dataset = EnergyDataset(
self.samples, self.ap, cache_path=energy_cache_path, precompute_num_workers=precompute_num_workers
)
if self.verbose:
self.print_logs()
@@ -211,6 +223,12 @@ class TTSDataset(Dataset):
assert item["audio_unique_name"] == out_dict["audio_unique_name"]
return out_dict
def get_energy(self, idx):
out_dict = self.energy_dataset[idx]
item = self.samples[idx]
assert item["audio_unique_name"] == out_dict["audio_unique_name"]
return out_dict
@staticmethod
def get_attn_mask(attn_file):
return np.load(attn_file)
@@ -252,12 +270,16 @@ class TTSDataset(Dataset):
f0 = None
if self.compute_f0:
f0 = self.get_f0(idx)["f0"]
energy = None
if self.compute_energy:
energy = self.get_energy(idx)["energy"]
sample = {
"raw_text": raw_text,
"token_ids": token_ids,
"wav": wav,
"pitch": f0,
"energy": energy,
"attn": attn,
"item_idx": item["audio_file"],
"speaker_name": item["speaker_name"],
@@ -490,7 +512,13 @@ class TTSDataset(Dataset):
pitch = torch.FloatTensor(pitch)[:, None, :].contiguous()  # B x 1 x T
else:
pitch = None
# format energy
if self.compute_energy:
energy = prepare_data(batch["energy"])
assert mel.shape[1] == energy.shape[1], f"[!] {mel.shape} vs {energy.shape}"
energy = torch.FloatTensor(energy)[:, None, :].contiguous()  # B x 1 x T
else:
energy = None
# format attention masks
attns = None
if batch["attn"][0] is not None:
@@ -519,6 +547,7 @@ class TTSDataset(Dataset):
"waveform": wav_padded,
"raw_text": batch["raw_text"],
"pitch": pitch,
"energy": energy,
"language_ids": language_ids,
"audio_unique_names": batch["audio_unique_name"],
}
@@ -777,3 +806,155 @@ class F0Dataset:
print("\n")
print(f"{indent}> F0Dataset ")
print(f"{indent}| > Number of instances : {len(self.samples)}")
class EnergyDataset:
"""Energy Dataset for computing Energy from wav files in CPU
Pre-compute Energy values for all the samples at initialization if `cache_path` is not None or already present. It
also computes the mean and std of Energy values if `normalize_Energy` is True.
Args:
samples (Union[List[List], List[Dict]]):
List of samples. Each sample is a list or a dict.
ap (AudioProcessor):
AudioProcessor to compute Energy from wav files.
cache_path (str):
Path to cache Energy values. If `cache_path` is already present or None, it skips the pre-computation.
Defaults to None.
precompute_num_workers (int):
Number of workers used for pre-computing the Energy values. Defaults to 0.
normalize_Energy (bool):
Whether to normalize Energy values by mean and std. Defaults to True.
"""
def __init__(
self,
samples: Union[List[List], List[Dict]],
ap: "AudioProcessor",
verbose=False,
cache_path: str = None,
precompute_num_workers=0,
normalize_energy=True,
):
self.samples = samples
self.ap = ap
self.verbose = verbose
self.cache_path = cache_path
self.normalize_energy = normalize_energy
self.pad_id = 0.0
self.mean = None
self.std = None
if cache_path is not None and not os.path.exists(cache_path):
os.makedirs(cache_path)
self.precompute(precompute_num_workers)
if normalize_energy:
self.load_stats(cache_path)
def __getitem__(self, idx):
item = self.samples[idx]
energy = self.compute_or_load(item["audio_file"])
if self.normalize_energy:
assert self.mean is not None and self.std is not None, " [!] Mean and STD are not available"
energy = self.normalize(energy)
return {"audio_file": item["audio_file"], "energy": energy}
def __len__(self):
return len(self.samples)
def precompute(self, num_workers=0):
print("[*] Pre-computing energys...")
with tqdm.tqdm(total=len(self)) as pbar:
batch_size = num_workers if num_workers > 0 else 1
# we do not normalize at preprocessing
normalize_energy = self.normalize_energy
self.normalize_energy = False
dataloader = torch.utils.data.DataLoader(
batch_size=batch_size, dataset=self, shuffle=False, num_workers=num_workers, collate_fn=self.collate_fn
)
computed_data = []
for batch in dataloader:
energy = batch["energy"]
computed_data.append(energy)
pbar.update(batch_size)
self.normalize_energy = normalize_energy
if self.normalize_energy:
computed_data = [tensor for batch in computed_data for tensor in batch] # flatten
energy_mean, energy_std = self.compute_energy_stats(computed_data)
energy_stats = {"mean": energy_mean, "std": energy_std}
np.save(os.path.join(self.cache_path, "energy_stats"), energy_stats, allow_pickle=True)
def get_pad_id(self):
return self.pad_id
@staticmethod
def create_energy_file_path(wav_file, cache_path):
file_name = os.path.splitext(os.path.basename(wav_file))[0]
energy_file = os.path.join(cache_path, file_name + "_energy.npy")
return energy_file
@staticmethod
def _compute_and_save_energy(ap, wav_file, energy_file=None):
wav = ap.load_wav(wav_file)
energy = calculate_energy(wav)
if energy_file:
np.save(energy_file, energy)
return energy
@staticmethod
def compute_energy_stats(energy_vecs):
nonzeros = np.concatenate([v[np.where(v != 0.0)[0]] for v in energy_vecs])
mean, std = np.mean(nonzeros), np.std(nonzeros)
return mean, std
def load_stats(self, cache_path):
stats_path = os.path.join(cache_path, "energy_stats.npy")
stats = np.load(stats_path, allow_pickle=True).item()
self.mean = stats["mean"].astype(np.float32)
self.std = stats["std"].astype(np.float32)
def normalize(self, energy):
zero_idxs = np.where(energy == 0.0)[0]
energy = energy - self.mean
energy = energy / self.std
energy[zero_idxs] = 0.0
return energy
def denormalize(self, energy):
zero_idxs = np.where(energy == 0.0)[0]
energy *= self.std
energy += self.mean
energy[zero_idxs] = 0.0
return energy
def compute_or_load(self, wav_file):
"""
compute energy and return a numpy array of energy values
"""
energy_file = self.create_energy_file_path(wav_file, self.cache_path)
if not os.path.exists(energy_file):
energy = self._compute_and_save_energy(self.ap, wav_file, energy_file)
else:
energy = np.load(energy_file)
return energy.astype(np.float32)
def collate_fn(self, batch):
audio_file = [item["audio_file"] for item in batch]
energies = [item["energy"] for item in batch]
energy_lens = [len(item["energy"]) for item in batch]
energy_lens_max = max(energy_lens)
energies_torch = torch.FloatTensor(len(energies), energy_lens_max).fill_(self.get_pad_id())
for i, energy_len in enumerate(energy_lens):
energies_torch[i, :energy_len] = torch.FloatTensor(energies[i])
return {"audio_file": audio_file, "energy": energies_torch, "energy_lens": energy_lens}
def print_logs(self, level: int = 0) -> None:
indent = "\t" * level
print("\n")
print(f"{indent}> energyDataset ")
print(f"{indent}| > Number of instances : {len(self.samples)}")

View File

@@ -801,6 +801,10 @@ class ForwardTTSLoss(nn.Module):
self.pitch_loss = MSELossMasked(False)
self.pitch_loss_alpha = c.pitch_loss_alpha
if c.model_args.use_energy:
self.energy_loss = MSELossMasked(False)
self.energy_loss_alpha = c.energy_loss_alpha
if c.use_ssim_loss:
self.ssim = SSIMLoss() if c.use_ssim_loss else None
self.ssim_loss_alpha = c.ssim_loss_alpha
@@ -826,6 +830,8 @@ class ForwardTTSLoss(nn.Module):
dur_target,
pitch_output,
pitch_target,
energy_output,
energy_target,
input_lens,
alignment_logprob=None,
alignment_hard=None,
@@ -855,6 +861,11 @@ class ForwardTTSLoss(nn.Module):
loss = loss + self.pitch_loss_alpha * pitch_loss
return_dict["loss_pitch"] = self.pitch_loss_alpha * pitch_loss
if hasattr(self, "energy_loss") and self.energy_loss_alpha > 0:
energy_loss = self.energy_loss(energy_output.transpose(1, 2), energy_target.transpose(1, 2), input_lens)
loss = loss + self.energy_loss_alpha * energy_loss
return_dict["loss_energy"] = self.energy_loss_alpha * energy_loss
if hasattr(self, "aligner_loss") and self.aligner_loss_alpha > 0:
aligner_loss = self.aligner_loss(alignment_logprob, input_lens, decoder_output_lens)
loss = loss + self.aligner_loss_alpha * aligner_loss
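
The energy term mirrors the pitch term: a masked MSE over character-averaged energy values, so padded positions do not contribute. A self-contained sketch of the masking idea (not the actual `MSELossMasked` implementation):

import torch

def masked_mse(x, target, lengths):
    # x, target: [B, T, 1]; lengths: [B], number of valid positions per sample
    mask = (torch.arange(x.size(1))[None, :] < lengths[:, None]).unsqueeze(-1).float()
    return ((x - target) ** 2 * mask).sum() / mask.sum()

x, target = torch.randn(2, 6, 1), torch.randn(2, 6, 1)
print(masked_mse(x, target, torch.tensor([6, 4])))  # averages over the 10 valid positions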

View File

@@ -15,7 +15,7 @@ from TTS.tts.models.base_tts import BaseTTS
from TTS.tts.utils.helpers import average_over_durations, generate_path, maximum_path, sequence_mask
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.visual import plot_alignment, plot_avg_pitch, plot_spectrogram
from TTS.tts.utils.visual import plot_alignment, plot_avg_energy, plot_avg_pitch, plot_spectrogram
from TTS.utils.io import load_fsspec
@@ -42,6 +42,9 @@ class ForwardTTSArgs(Coqpit):
use_pitch (bool):
Use pitch predictor to learn the pitch. Defaults to True.
use_energy (bool):
Use energy predictor to learn the energy. Defaults to False.
duration_predictor_hidden_channels (int):
Number of hidden channels in the duration predictor. Defaults to 256.
@@ -63,6 +66,18 @@ class ForwardTTSArgs(Coqpit):
pitch_embedding_kernel_size (int):
Kernel size of the projection layer in the pitch predictor. Defaults to 3.
energy_predictor_hidden_channels (int):
Number of hidden channels in the energy predictor. Defaults to 256.
energy_predictor_dropout_p (float):
Dropout rate for the energy predictor. Defaults to 0.1.
energy_predictor_kernel_size (int):
Kernel size of conv layers in the energy predictor. Defaults to 3.
energy_embedding_kernel_size (int):
Kernel size of the projection layer in the energy predictor. Defaults to 3.
positional_encoding (bool):
Whether to use positional encoding. Defaults to True.
@@ -114,14 +129,25 @@ class ForwardTTSArgs(Coqpit):
out_channels: int = 80
hidden_channels: int = 384
use_aligner: bool = True
# pitch params
use_pitch: bool = True
pitch_predictor_hidden_channels: int = 256
pitch_predictor_kernel_size: int = 3
pitch_predictor_dropout_p: float = 0.1
pitch_embedding_kernel_size: int = 3
# energy params
use_energy: bool = False
energy_predictor_hidden_channels: int = 256
energy_predictor_kernel_size: int = 3
energy_predictor_dropout_p: float = 0.1
energy_embedding_kernel_size: int = 3
# duration params
duration_predictor_hidden_channels: int = 256
duration_predictor_kernel_size: int = 3
duration_predictor_dropout_p: float = 0.1
positional_encoding: bool = True
poisitonal_encoding_use_scale: bool = True
length_scale: int = 1
@@ -158,7 +184,7 @@ class ForwardTTS(BaseTTS):
- FastPitch
- SpeedySpeech
- FastSpeech
- TODO: FastSpeech2 (requires average speech energy predictor)
- FastSpeech2 (requires average speech energy predictor)
Args:
config (Coqpit): Model coqpit class.
@@ -187,6 +213,7 @@ class ForwardTTS(BaseTTS):
self.max_duration = self.args.max_duration
self.use_aligner = self.args.use_aligner
self.use_pitch = self.args.use_pitch
self.use_energy = self.args.use_energy
self.binary_loss_weight = 0.0
self.length_scale = (
@@ -234,6 +261,20 @@ class ForwardTTS(BaseTTS):
padding=int((self.args.pitch_embedding_kernel_size - 1) / 2),
)
if self.args.use_energy:
self.energy_predictor = DurationPredictor(
self.args.hidden_channels + self.embedded_speaker_dim,
self.args.energy_predictor_hidden_channels,
self.args.energy_predictor_kernel_size,
self.args.energy_predictor_dropout_p,
)
self.energy_emb = nn.Conv1d(
1,
self.args.hidden_channels,
kernel_size=self.args.energy_embedding_kernel_size,
padding=int((self.args.energy_embedding_kernel_size - 1) / 2),
)
if self.args.use_aligner:
self.aligner = AlignmentNetwork(
in_query_channels=self.args.out_channels, in_key_channels=self.args.hidden_channels
@@ -440,6 +481,42 @@ class ForwardTTS(BaseTTS):
o_pitch_emb = self.pitch_emb(o_pitch)
return o_pitch_emb, o_pitch
def _forward_energy_predictor(
self,
o_en: torch.FloatTensor,
x_mask: torch.IntTensor,
energy: torch.FloatTensor = None,
dr: torch.IntTensor = None,
) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
"""Energy predictor forward pass.
1. Predict energy from encoder outputs.
2. In training - Compute average energy values for each input character from the ground truth energy values.
3. Embed average energy values.
Args:
o_en (torch.FloatTensor): Encoder output.
x_mask (torch.IntTensor): Input sequence mask.
energy (torch.FloatTensor, optional): Ground truth energy values. Defaults to None.
dr (torch.IntTensor, optional): Ground truth durations. Defaults to None.
Returns:
Tuple[torch.FloatTensor, torch.FloatTensor]: Energy embedding, energy prediction (and the averaged ground-truth energy when `energy` is given).
Shapes:
- o_en: :math:`(B, C, T_{en})`
- x_mask: :math:`(B, 1, T_{en})`
- energy: :math:`(B, 1, T_{de})`
- dr: :math:`(B, T_{en})`
"""
o_energy = self.energy_predictor(o_en, x_mask)
if energy is not None:
avg_energy = average_over_durations(energy, dr)
o_energy_emb = self.energy_emb(avg_energy)
return o_energy_emb, o_energy, avg_energy
o_energy_emb = self.energy_emb(o_energy)
return o_energy_emb, o_energy
def _forward_aligner(
self, x: torch.FloatTensor, y: torch.FloatTensor, x_mask: torch.IntTensor, y_mask: torch.IntTensor
) -> Tuple[torch.IntTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
@@ -502,6 +579,7 @@ class ForwardTTS(BaseTTS):
y: torch.FloatTensor = None,
dr: torch.IntTensor = None,
pitch: torch.FloatTensor = None,
energy: torch.FloatTensor = None,
aux_input: Dict = {"d_vectors": None, "speaker_ids": None}, # pylint: disable=unused-argument
) -> Dict:
"""Model's forward pass.
@@ -513,6 +591,7 @@ class ForwardTTS(BaseTTS):
y (torch.FloatTensor): Spectrogram frames. Only used when the alignment network is on. Defaults to None.
dr (torch.IntTensor): Character durations over the spectrogram frames. Only used when the alignment network is off. Defaults to None.
pitch (torch.FloatTensor): Pitch values for each spectrogram frame. Only used when the pitch predictor is on. Defaults to None.
energy (torch.FloatTensor): Energy values for each spectrogram frame. Only used when the energy predictor is on. Defaults to None.
aux_input (Dict): Auxiliary model inputs for multi-speaker training. Defaults to `{"d_vectors": 0, "speaker_ids": None}`.
Shapes:
@@ -556,6 +635,12 @@ class ForwardTTS(BaseTTS):
if self.args.use_pitch:
o_pitch_emb, o_pitch, avg_pitch = self._forward_pitch_predictor(o_en, x_mask, pitch, dr)
o_en = o_en + o_pitch_emb
# energy predictor pass
o_energy = None
avg_energy = None
if self.args.use_energy:
o_energy_emb, o_energy, avg_energy = self._forward_energy_predictor(o_en, x_mask, energy, dr)
o_en = o_en + o_energy_emb
# decoder pass
o_de, attn = self._forward_decoder(
o_en, dr, x_mask, y_lengths, g=None
@@ -567,6 +652,8 @@ class ForwardTTS(BaseTTS):
"attn_durations": o_attn, # for visualization [B, T_en, T_de']
"pitch_avg": o_pitch,
"pitch_avg_gt": avg_pitch,
"energy_avg": o_energy,
"energy_avg_gt": avg_energy,
"alignments": attn, # [B, T_de, T_en]
"alignment_soft": alignment_soft,
"alignment_mas": alignment_mas,
@@ -604,12 +691,18 @@ class ForwardTTS(BaseTTS):
if self.args.use_pitch:
o_pitch_emb, o_pitch = self._forward_pitch_predictor(o_en, x_mask)
o_en = o_en + o_pitch_emb
# energy predictor pass
o_energy = None
if self.args.use_energy:
o_energy_emb, o_energy = self._forward_energy_predictor(o_en, x_mask)
o_en = o_en + o_energy_emb
# decoder pass
o_de, attn = self._forward_decoder(o_en, o_dr, x_mask, y_lengths, g=None)
outputs = {
"model_outputs": o_de,
"alignments": attn,
"pitch": o_pitch,
"energy": o_energy,
"durations_log": o_dr_log,
}
return outputs
@@ -620,6 +713,7 @@ class ForwardTTS(BaseTTS):
mel_input = batch["mel_input"]
mel_lengths = batch["mel_lengths"]
pitch = batch["pitch"] if self.args.use_pitch else None
energy = batch["energy"] if self.args.use_energy else None
d_vectors = batch["d_vectors"]
speaker_ids = batch["speaker_ids"]
durations = batch["durations"]
@@ -627,7 +721,14 @@ class ForwardTTS(BaseTTS):
# forward pass
outputs = self.forward(
text_input, text_lengths, mel_lengths, y=mel_input, dr=durations, pitch=pitch, aux_input=aux_input
text_input,
text_lengths,
mel_lengths,
y=mel_input,
dr=durations,
pitch=pitch,
energy=energy,
aux_input=aux_input,
)
# use aligner's output as the duration target
if self.use_aligner:
@@ -643,6 +744,8 @@ class ForwardTTS(BaseTTS):
dur_target=durations,
pitch_output=outputs["pitch_avg"] if self.use_pitch else None,
pitch_target=outputs["pitch_avg_gt"] if self.use_pitch else None,
energy_output=outputs["energy_avg"] if self.use_energy else None,
energy_target=outputs["energy_avg_gt"] if self.use_energy else None,
input_lens=text_lengths,
alignment_logprob=outputs["alignment_logprob"] if self.use_aligner else None,
alignment_soft=outputs["alignment_soft"],
@@ -683,6 +786,17 @@ class ForwardTTS(BaseTTS):
}
figures.update(pitch_figures)
# plot energy figures
if self.args.use_energy:
energy_avg = abs(outputs["energy_avg_gt"][0, 0].data.cpu().numpy())
energy_avg_hat = abs(outputs["energy_avg"][0, 0].data.cpu().numpy())
chars = self.tokenizer.decode(batch["text_input"][0].data.cpu().numpy())
energy_figures = {
"energy_ground_truth": plot_avg_energy(energy_avg, chars, output_fig=False),
"energy_avg_predicted": plot_avg_energy(energy_avg_hat, chars, output_fig=False),
}
figures.update(energy_figures)
# plot the attention mask computed from the predicted durations
if "attn_durations" in outputs:
alignments_hat = outputs["attn_durations"][0].data.cpu().numpy()
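
For intuition on the `average_over_durations` step used by the pitch and energy predictors above: frame-level values are averaged per input character according to that character's duration. A standalone sketch (the real helper lives in `TTS.tts.utils.helpers` and is vectorized):

import torch

def average_over_durations_sketch(values, durs):
    # values: [B, 1, T_frames]; durs: [B, T_chars], integer frame counts per character
    avgs = torch.zeros(values.size(0), 1, durs.size(1))
    for b in range(values.size(0)):
        start = 0
        for t, d in enumerate(durs[b].tolist()):
            if d > 0:
                avgs[b, 0, t] = values[b, 0, start:start + d].mean()
            start += d
    return avgs

energy = torch.tensor([[[1.0, 1.0, 3.0, 5.0]]])  # 4 frames
durs = torch.tensor([[2, 2]])                    # 2 characters, 2 frames each
print(average_over_durations_sketch(energy, durs))  # tensor([[[1., 4.]]])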

View File

@@ -123,6 +123,39 @@ def plot_avg_pitch(pitch, chars, fig_size=(30, 10), output_fig=False):
return fig
def plot_avg_energy(energy, chars, fig_size=(30, 10), output_fig=False):
"""Plot energy curves on top of the input characters.
Args:
energy (np.array): Energy values.
chars (str): Characters to place on the x-axis.
Shapes:
energy: :math:`(T,)`
"""
old_fig_size = plt.rcParams["figure.figsize"]
if fig_size is not None:
plt.rcParams["figure.figsize"] = fig_size
fig, ax = plt.subplots()
x = np.array(range(len(chars)))
my_xticks = chars
plt.xticks(x, my_xticks)
ax.set_xlabel("characters")
ax.set_ylabel("freq")
ax2 = ax.twinx()
ax2.plot(energy, linewidth=5.0, color="red")
ax2.set_ylabel("energy")
plt.rcParams["figure.figsize"] = old_fig_size
if not output_fig:
plt.close()
return fig
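
A quick usage sketch with synthetic values (one energy value per character):

import numpy as np
from TTS.tts.utils.visual import plot_avg_energy

chars = "hello world!"
energy = np.random.rand(len(chars)).astype(np.float32)
fig = plot_avg_energy(energy, chars, output_fig=True)
fig.savefig("avg_energy.png")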
def visualize(
alignment,
postnet_output,

View File

@@ -4,7 +4,7 @@ import librosa
import numpy as np
import scipy
import soundfile as sf
from librosa import pyin
from librosa import magphase, pyin
# For using kwargs
# pylint: disable=unused-argument
@@ -303,6 +303,27 @@ def compute_f0(
return f0
def compute_energy(y: np.ndarray, **kwargs) -> np.ndarray:
"""Compute energy of a waveform using the same parameters used for computing melspectrogram.
Args:
y (np.ndarray): Waveform. Shape :math:`[T_wav,]`
Returns:
np.ndarray: energy. Shape :math:`[T_energy,]`. :math:`T_energy == T_wav / hop_length`
Examples:
>>> WAV_FILE = librosa.util.example_audio_file()
>>> from TTS.config import BaseAudioConfig
>>> from TTS.utils.audio import AudioProcessor
>>> conf = BaseAudioConfig()
>>> ap = AudioProcessor(**conf)
>>> wav = ap.load_wav(WAV_FILE, sr=ap.sample_rate)[:5 * ap.sample_rate]
>>> energy = compute_energy(y=wav)
"""
x = stft(y=y, **kwargs)
mag, _ = magphase(x)
energy = np.sqrt(np.sum(mag**2, axis=0))
return energy
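
Equivalently, a standalone sketch of what this function computes, the L2 norm of each STFT frame, using librosa directly (frame parameters are placeholders; match them to your mel settings):

import librosa
import numpy as np

def energy_from_wav(wav, n_fft=1024, hop_length=256, win_length=1024):
    mag, _ = librosa.magphase(librosa.stft(y=wav, n_fft=n_fft, hop_length=hop_length, win_length=win_length))
    return np.sqrt(np.sum(mag**2, axis=0))  # one value per frame

wav = np.random.randn(22050).astype(np.float32)  # 1 second of noise as a stand-in
print(energy_from_wav(wav).shape)  # (len(wav) // hop_length + 1,)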
### Audio Processing ###
def find_endpoint(
*,

View File

@@ -0,0 +1,102 @@
import os
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
from TTS.tts.configs.fastspeech2_config import Fastspeech2Config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.forward_tts import ForwardTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.utils.manage import ModelManager
output_path = os.path.dirname(os.path.abspath(__file__))
# init configs
dataset_config = BaseDatasetConfig(
formatter="ljspeech",
meta_file_train="metadata.csv",
# meta_file_attn_mask=os.path.join(output_path, "../LJSpeech-1.1/metadata_attn_mask.txt"),
path=os.path.join(output_path, "../LJSpeech-1.1/"),
)
audio_config = BaseAudioConfig(
sample_rate=22050,
do_trim_silence=True,
trim_db=60.0,
signal_norm=False,
mel_fmin=0.0,
mel_fmax=8000,
spec_gain=1.0,
log_func="np.log",
ref_level_db=20,
preemphasis=0.0,
)
config = Fastspeech2Config(
run_name="fastspeech2_ljspeech",
audio=audio_config,
batch_size=32,
eval_batch_size=16,
num_loader_workers=8,
num_eval_loader_workers=4,
compute_input_seq_cache=True,
compute_f0=True,
f0_cache_path=os.path.join(output_path, "f0_cache"),
compute_energy=True,
energy_cache_path=os.path.join(output_path, "energy_cache"),
run_eval=True,
test_delay_epochs=-1,
epochs=1000,
text_cleaner="english_cleaners",
use_phonemes=True,
phoneme_language="en-us",
phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
precompute_num_workers=4,
print_step=50,
print_eval=False,
mixed_precision=False,
max_seq_len=500000,
output_path=output_path,
datasets=[dataset_config],
)
# compute alignments
if not config.model_args.use_aligner:
manager = ModelManager()
model_path, config_path, _ = manager.download_model("tts_models/en/ljspeech/tacotron2-DCA")
# TODO: make compute_attention python callable
os.system(
f"python TTS/bin/compute_attention_masks.py --model_path {model_path} --config_path {config_path} --dataset ljspeech --dataset_metafile metadata.csv --data_path ./recipes/ljspeech/LJSpeech-1.1/ --use_cuda true"
)
# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves to the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)
# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are passed to the config
tokenizer, config = TTSTokenizer.init_from_config(config)
# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
dataset_config,
eval_split=True,
eval_split_max_size=config.eval_split_max_size,
eval_split_size=config.eval_split_size,
)
# init the model
model = ForwardTTS(config, ap, tokenizer, speaker_manager=None)
# init the trainer and 🚀
trainer = Trainer(
TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
trainer.fit()
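
Assuming LJSpeech-1.1 is extracted next to this script as configured above, training starts with (the recipe path below is an assumption matching this commit's layout):

CUDA_VISIBLE_DEVICES=0 python recipes/ljspeech/fastspeech2/train_fastspeech2.py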

View File

@@ -0,0 +1,95 @@
import glob
import json
import os
import shutil
from trainer import get_last_checkpoint
from tests import get_device_id, get_tests_output_path, run_cli
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.fastspeech2_config import Fastspeech2Config
config_path = os.path.join(get_tests_output_path(), "fastspeech2_speaker_emb_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")
audio_config = BaseAudioConfig(
sample_rate=22050,
do_trim_silence=True,
trim_db=60.0,
signal_norm=False,
mel_fmin=0.0,
mel_fmax=8000,
spec_gain=1.0,
log_func="np.log",
ref_level_db=20,
preemphasis=0.0,
)
config = Fastspeech2Config(
audio=audio_config,
batch_size=8,
eval_batch_size=8,
num_loader_workers=0,
num_eval_loader_workers=0,
text_cleaner="english_cleaners",
use_phonemes=True,
phoneme_language="en-us",
phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
f0_cache_path="tests/data/ljspeech/f0_cache/",
compute_f0=True,
compute_energy=True,
energy_cache_path="tests/data/ljspeech/energy_cache/",
run_eval=True,
test_delay_epochs=-1,
epochs=1,
print_step=1,
print_eval=True,
use_speaker_embedding=True,
test_sentences=[
"Be a voice, not an echo.",
],
)
config.audio.do_trim_silence = True
config.use_speaker_embedding = True
config.model_args.use_speaker_embedding = True
config.audio.trim_db = 60
config.save_json(config_path)
# train the model for one epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} "
f"--coqpit.output_path {output_path} "
"--coqpit.datasets.0.formatter ljspeech_test "
"--coqpit.datasets.0.meta_file_train metadata.csv "
"--coqpit.datasets.0.meta_file_val metadata.csv "
"--coqpit.datasets.0.path tests/data/ljspeech "
"--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt "
"--coqpit.test_delay_epochs 0"
)
run_cli(command_train)
# Find latest folder
continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
# Inference using TTS API
continue_config_path = os.path.join(continue_path, "config.json")
continue_restore_path, _ = get_last_checkpoint(continue_path)
out_wav_path = os.path.join(get_tests_output_path(), "output.wav")
speaker_id = "ljspeech-1"
continue_speakers_path = os.path.join(continue_path, "speakers.json")
# Check integrity of the config
with open(continue_config_path, "r", encoding="utf-8") as f:
config_loaded = json.load(f)
assert config_loaded["characters"] is not None
assert config_loaded["output_path"] in continue_path
assert config_loaded["test_delay_epochs"] == 0
# Load the model and run inference
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)
# restore the model and continue training for one more epoch
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} "
run_cli(command_train)
shutil.rmtree(continue_path)

View File

@@ -0,0 +1,94 @@
import glob
import json
import os
import shutil
from trainer import get_last_checkpoint
from tests import get_device_id, get_tests_output_path, run_cli
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.fastspeech2_config import Fastspeech2Config
config_path = os.path.join(get_tests_output_path(), "test_model_config.json")
output_path = os.path.join(get_tests_output_path(), "train_outputs")
audio_config = BaseAudioConfig(
sample_rate=22050,
do_trim_silence=True,
trim_db=60.0,
signal_norm=False,
mel_fmin=0.0,
mel_fmax=8000,
spec_gain=1.0,
log_func="np.log",
ref_level_db=20,
preemphasis=0.0,
)
config = Fastspeech2Config(
audio=audio_config,
batch_size=8,
eval_batch_size=8,
num_loader_workers=0,
num_eval_loader_workers=0,
text_cleaner="english_cleaners",
use_phonemes=True,
phoneme_language="en-us",
phoneme_cache_path="tests/data/ljspeech/phoneme_cache/",
f0_cache_path="tests/data/ljspeech/f0_cache/",
compute_f0=True,
compute_energy=True,
energy_cache_path="tests/data/ljspeech/energy_cache/",
run_eval=True,
test_delay_epochs=-1,
epochs=1,
print_step=1,
print_eval=True,
test_sentences=[
"Be a voice, not an echo.",
],
use_speaker_embedding=False,
)
config.audio.do_trim_silence = True
config.use_speaker_embedding = False
config.model_args.use_speaker_embedding = False
config.audio.trim_db = 60
config.save_json(config_path)
# train the model for one epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --config_path {config_path} "
f"--coqpit.output_path {output_path} "
"--coqpit.datasets.0.formatter ljspeech "
"--coqpit.datasets.0.meta_file_train metadata.csv "
"--coqpit.datasets.0.meta_file_val metadata.csv "
"--coqpit.datasets.0.path tests/data/ljspeech "
"--coqpit.datasets.0.meta_file_attn_mask tests/data/ljspeech/metadata_attn_mask.txt "
"--coqpit.test_delay_epochs 0"
)
run_cli(command_train)
# Find latest folder
continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)
# Inference using TTS API
continue_config_path = os.path.join(continue_path, "config.json")
continue_restore_path, _ = get_last_checkpoint(continue_path)
out_wav_path = os.path.join(get_tests_output_path(), "output.wav")
# Check integrity of the config
with open(continue_config_path, "r", encoding="utf-8") as f:
config_loaded = json.load(f)
assert config_loaded["characters"] is not None
assert config_loaded["output_path"] in continue_path
assert config_loaded["test_delay_epochs"] == 0
# Load the model and run inference
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
run_cli(inference_command)
# restore the model and continue training for one more epoch
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_tts.py --continue_path {continue_path} "
run_cli(command_train)
shutil.rmtree(continue_path)