From f9199b04c464f6949110b61dc548cd4495a420a1 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Mon, 4 Jul 2022 14:45:46 -0300
Subject: [PATCH] Fix bug in external embeddings training

Make the "emotion_name" field optional in VitsDataset so that items
without emotion labels no longer raise a KeyError, remove the
self._freeze_layers() call from the training step, and wrap the
synthesis of each test sentence in a try/except so that one failing
sentence is logged instead of aborting the whole test run. The
external-emotion training test also gets longer test sentences,
disables duration-predictor conditioning on the speaker, and enables
mixed precision and cudnn benchmark.

---
 TTS/tts/models/vits.py                        | 45 ++++++++++---------
 ...ts_d_vector_with_external_emotion_train.py |  9 +++-
 2 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index 98beb97a..9597c78e 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -1,5 +1,6 @@
 import math
 import os
+import traceback
 import numpy as np
 import pyworld as pw
 from dataclasses import dataclass, field, replace
@@ -312,7 +313,7 @@ class VitsDataset(TTSDataset):
             "wav_file": wav_filename,
             "speaker_name": item["speaker_name"],
             "language_name": item["language"],
-            "emotion_name": item["emotion_name"],
+            "emotion_name": item["emotion_name"] if "emotion_name" in item else None,
 
             "pitch": f0,
             "alignments": alignments,
@@ -1857,7 +1858,6 @@ class Vits(BaseTTS):
         Returns:
             Tuple[Dict, Dict]: Model ouputs and computed losses.
         """
-        self._freeze_layers()
 
         spec_lens = batch["spec_lens"]
 
@@ -2163,24 +2163,29 @@ class Vits(BaseTTS):
         test_sentences = self.config.test_sentences
         for idx, s_info in enumerate(test_sentences):
             aux_inputs = self.get_aux_input_from_test_sentences(s_info)
-            wav, alignment, _, _ = synthesis(
-                self,
-                aux_inputs["text"],
-                self.config,
-                "cuda" in str(next(self.parameters()).device),
-                speaker_id=aux_inputs["speaker_id"],
-                d_vector=aux_inputs["d_vector"],
-                style_wav=aux_inputs["style_wav"],
-                language_id=aux_inputs["language_id"],
-                emotion_embedding=aux_inputs["emotion_embedding"],
-                emotion_id=aux_inputs["emotion_ids"],
-                style_speaker_id=aux_inputs["style_speaker_id"],
-                style_speaker_d_vector=aux_inputs["style_speaker_d_vector"],
-                use_griffin_lim=True,
-                do_trim_silence=False,
-            ).values()
-            test_audios["{}-audio".format(idx)] = wav
-            test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False)
+            try:
+                wav, alignment, _, _ = synthesis(
+                    self,
+                    aux_inputs["text"],
+                    self.config,
+                    "cuda" in str(next(self.parameters()).device),
+                    speaker_id=aux_inputs["speaker_id"],
+                    d_vector=aux_inputs["d_vector"],
+                    style_wav=aux_inputs["style_wav"],
+                    language_id=aux_inputs["language_id"],
+                    emotion_embedding=aux_inputs["emotion_embedding"],
+                    emotion_id=aux_inputs["emotion_ids"],
+                    style_speaker_id=aux_inputs["style_speaker_id"],
+                    style_speaker_d_vector=aux_inputs["style_speaker_d_vector"],
+                    use_griffin_lim=True,
+                    do_trim_silence=False,
+                ).values()
+                test_audios["{}-audio".format(idx)] = wav
+                test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False)
+            except Exception:
+                print("Error during synthesis of the test sentence:", aux_inputs)
+                traceback.print_exc()
+
         return {"figures": test_figures, "audios": test_audios}
 
     def test_log(
diff --git a/tests/tts_tests/test_vits_d_vector_with_external_emotion_train.py b/tests/tts_tests/test_vits_d_vector_with_external_emotion_train.py
index e5046826..9fccfedf 100644
--- a/tests/tts_tests/test_vits_d_vector_with_external_emotion_train.py
+++ b/tests/tts_tests/test_vits_d_vector_with_external_emotion_train.py
@@ -26,6 +26,9 @@ config = VitsConfig(
     print_step=1,
     print_eval=True,
     test_sentences=[
+        ["There", "ljspeech-1", None, None, "ljspeech-1"],
+        ["To access Overdub Stock Voices, first make sure you’ve updated the app, then click Edit > Manage Speakers inside any Composition. Add a new Speaker with any name you choose, and then click Overdub Voice to select from our included Stock Voices. You don’t need to have created an Overdub Voice of your own to access these Stock Voices, so you can start using these voices in no time at all!", "ljspeech-1", None, None, "ljspeech-1"],
+        ["To access Overdub Stock Voices, first make sure you’ve updated the app, then click Edit > Manage Speakers inside any Composition. Add a new Speaker with any name you choose, and then click Overdub Voice to select from our included Stock Voices. You don’t need to have created an Overdub Voice of your own to access these Stock Voices, so you can start using these voices in no time at all. To access Overdub Stock Voices, first make sure you’ve updated the app, then click Edit > Manage Speakers inside any Composition. Add a new Speaker with any name you choose, and then click Overdub Voice to select from our included Stock Voices. You don’t need to have created an Overdub Voice of your own to access these Stock Voices, so you can start using these voices in no time at all!", "ljspeech-2", None, None, "ljspeech-2"],
         ["Be a voice, not an echo.", "ljspeech-1", None, None, "ljspeech-1"],
     ],
 )
@@ -45,9 +48,13 @@ config.model_args.d_vector_dim = 256
 config.model_args.use_external_emotions_embeddings = True
 config.model_args.use_emotion_embedding = False
 config.model_args.emotion_embedding_dim = 256
-config.model_args.emotion_just_encoder = True
 config.model_args.external_emotions_embs_file = "tests/data/ljspeech/speakers.json"
+config.model_args.condition_dp_on_speaker = False
 config.use_style_weighted_sampler = True
+
+config.mixed_precision = True
+config.cudnn_benchmark = True
+
 # consistency loss
 # config.model_args.use_emotion_encoder_as_loss = True
 # config.model_args.encoder_model_path = "/raid/edresson/dev/Checkpoints/Coqui-Realesead/tts_models--multilingual--multi-dataset--your_tts/model_se.pth.tar"
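
Both fixes follow the same defensive pattern: optional metadata falls back to
None instead of raising a KeyError, and a per-sentence failure is logged
without killing the whole test run. A minimal, self-contained Python sketch of
that pattern follows; the fake_synthesis helper and the sample items are
hypothetical stand-ins for the real synthesis call and dataset items, not part
of this patch.

    import traceback

    # Hypothetical dataset items: only some of them carry an "emotion_name" field.
    items = [
        {"speaker_name": "ljspeech-1", "emotion_name": "happy"},
        {"speaker_name": "ljspeech-2"},  # no emotion metadata
    ]

    def fake_synthesis(emotion_name):
        # Hypothetical stand-in for the real synthesis call; it fails when the
        # emotion metadata is missing, as a lookup into an embeddings file would.
        if emotion_name is None:
            raise ValueError("missing emotion embedding")
        return b"wav-bytes"

    test_audios = {}
    for idx, item in enumerate(items):
        # Same guard as the patched VitsDataset: a missing key becomes None,
        # not a KeyError.
        emotion_name = item["emotion_name"] if "emotion_name" in item else None
        try:
            test_audios["{}-audio".format(idx)] = fake_synthesis(emotion_name)
        except Exception:
            # Same guard as the patched Vits.test_run: log the failure and keep going.
            print("Error during synthesis of the test sentence:", item)
            traceback.print_exc()

    print(sorted(test_audios))  # ['0-audio']: the first item still produced audio

Catching Exception rather than using a bare except keeps KeyboardInterrupt and
SystemExit propagating, which is why the test_run hunk above uses it.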