Bug fix in external embeddings training

Edresson Casanova 2022-07-04 14:45:46 -03:00
parent 65222b1f8c
commit f9199b04c4
2 changed files with 33 additions and 21 deletions


@@ -1,5 +1,6 @@
 import math
 import os
+import traceback
 import numpy as np
 import pyworld as pw
 from dataclasses import dataclass, field, replace
@@ -312,7 +313,7 @@ class VitsDataset(TTSDataset):
             "wav_file": wav_filename,
             "speaker_name": item["speaker_name"],
             "language_name": item["language"],
-            "emotion_name": item["emotion_name"],
+            "emotion_name": item["emotion_name"] if "emotion_name" in item else None,
             "pitch": f0,
             "alignments": alignments,
@@ -1857,7 +1858,6 @@ class Vits(BaseTTS):
         Returns:
             Tuple[Dict, Dict]: Model outputs and computed losses.
         """
-
         self._freeze_layers()
         spec_lens = batch["spec_lens"]
@@ -2163,24 +2163,29 @@
         test_sentences = self.config.test_sentences
         for idx, s_info in enumerate(test_sentences):
             aux_inputs = self.get_aux_input_from_test_sentences(s_info)
-            wav, alignment, _, _ = synthesis(
-                self,
-                aux_inputs["text"],
-                self.config,
-                "cuda" in str(next(self.parameters()).device),
-                speaker_id=aux_inputs["speaker_id"],
-                d_vector=aux_inputs["d_vector"],
-                style_wav=aux_inputs["style_wav"],
-                language_id=aux_inputs["language_id"],
-                emotion_embedding=aux_inputs["emotion_embedding"],
-                emotion_id=aux_inputs["emotion_ids"],
-                style_speaker_id=aux_inputs["style_speaker_id"],
-                style_speaker_d_vector=aux_inputs["style_speaker_d_vector"],
-                use_griffin_lim=True,
-                do_trim_silence=False,
-            ).values()
-            test_audios["{}-audio".format(idx)] = wav
-            test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False)
+            try:
+                wav, alignment, _, _ = synthesis(
+                    self,
+                    aux_inputs["text"],
+                    self.config,
+                    "cuda" in str(next(self.parameters()).device),
+                    speaker_id=aux_inputs["speaker_id"],
+                    d_vector=aux_inputs["d_vector"],
+                    style_wav=aux_inputs["style_wav"],
+                    language_id=aux_inputs["language_id"],
+                    emotion_embedding=aux_inputs["emotion_embedding"],
+                    emotion_id=aux_inputs["emotion_ids"],
+                    style_speaker_id=aux_inputs["style_speaker_id"],
+                    style_speaker_d_vector=aux_inputs["style_speaker_d_vector"],
+                    use_griffin_lim=True,
+                    do_trim_silence=False,
+                ).values()
+                test_audios["{}-audio".format(idx)] = wav
+                test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False)
+            except:
+                print("Error during the synthesis of the sentence:", aux_inputs)
+                traceback.print_exc()
         return {"figures": test_figures, "audios": test_audios}
 
     def test_log(
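
Note: the hunk above wraps per-sentence synthesis in try/except so that one failing test sentence no longer aborts the whole test run; the new `traceback` import supports the error logging. A minimal sketch of the same pattern in isolation (`synthesize_one` is a hypothetical stand-in for the model call, not a function from the repo):

```python
import traceback

def run_test_sentences(sentences, synthesize_one):
    """Render each sentence; log and skip failures instead of aborting the run."""
    outputs = {}
    for idx, sentence in enumerate(sentences):
        try:
            outputs["{}-audio".format(idx)] = synthesize_one(sentence)
        except Exception:  # the diff uses a bare `except:`; Exception spares KeyboardInterrupt
            print("Error during the synthesis of the sentence:", sentence)
            traceback.print_exc()
    return outputs
```
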


@@ -26,6 +26,9 @@ config = VitsConfig(
     print_step=1,
     print_eval=True,
     test_sentences=[
+        ["There", "ljspeech-1", None, None, "ljspeech-1"],
+        ["To access Overdub Stock Voices, first make sure you've updated the app, then click Edit > Manage Speakers inside any Composition. Add a new Speaker with any name you choose, and then click Overdub Voice to select from our included Stock Voices. You don't need to have created an Overdub Voice of your own to access these Stock Voices, so you can start using these voices in no time at all!", "ljspeech-1", None, None, "ljspeech-1"],
+        ["To access Overdub Stock Voices, first make sure you've updated the app, then click Edit > Manage Speakers inside any Composition. Add a new Speaker with any name you choose, and then click Overdub Voice to select from our included Stock Voices. You don't need to have created an Overdub Voice of your own to access these Stock Voices, so you can start using these voices in no time at all. To access Overdub Stock Voices, first make sure you've updated the app, then click Edit > Manage Speakers inside any Composition. Add a new Speaker with any name you choose, and then click Overdub Voice to select from our included Stock Voices. You don't need to have created an Overdub Voice of your own to access these Stock Voices, so you can start using these voices in no time at all!", "ljspeech-2", None, None, "ljspeech-2"],
         ["Be a voice, not an echo.", "ljspeech-1", None, None, "ljspeech-1"],
     ],
 )
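
Note: each test-sentence row has five positional fields. Judging from how `get_aux_input_from_test_sentences` is used in the first file, the order appears to be text, speaker, style wav, language, style speaker; that reading is inferred from this diff alone, not confirmed elsewhere. A hypothetical unpacking:

```python
# Hypothetical parser for a five-element test-sentence row; field names are
# inferred from this diff and may not match the repo's actual parser.
def parse_test_sentence(row):
    text, speaker_name, style_wav, language_name, style_speaker_name = row
    return {
        "text": text,
        "speaker_name": speaker_name,
        "style_wav": style_wav,
        "language_name": language_name,
        "style_speaker_name": style_speaker_name,
    }

print(parse_test_sentence(["Be a voice, not an echo.", "ljspeech-1", None, None, "ljspeech-1"]))
```
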
@@ -45,9 +48,13 @@ config.model_args.d_vector_dim = 256
 config.model_args.use_external_emotions_embeddings = True
 config.model_args.use_emotion_embedding = False
 config.model_args.emotion_embedding_dim = 256
+config.model_args.emotion_just_encoder = True
 config.model_args.external_emotions_embs_file = "tests/data/ljspeech/speakers.json"
+config.model_args.condition_dp_on_speaker = False
 config.use_style_weighted_sampler = True
+config.mixed_precision = True
+config.cudnn_benchmark = True
 # consistency loss
 # config.model_args.use_emotion_encoder_as_loss = True
 # config.model_args.encoder_model_path = "/raid/edresson/dev/Checkpoints/Coqui-Realesead/tts_models--multilingual--multi-dataset--your_tts/model_se.pth.tar"
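
Note: `mixed_precision` and `cudnn_benchmark` are trainer-level switches. Assuming they map to the usual PyTorch mechanisms (torch AMP and cuDNN autotuning; this is an assumption about the trainer, not code from this repo), they roughly amount to:

```python
import torch

# cudnn_benchmark=True: let cuDNN autotune conv kernels; pays off when input
# shapes are mostly fixed, can hurt when they vary a lot between batches.
torch.backends.cudnn.benchmark = True

# mixed_precision=True: run the forward pass under autocast and scale the loss
# so float16 gradients do not underflow (illustrative, not the trainer's code).
scaler = torch.cuda.amp.GradScaler()
with torch.cuda.amp.autocast():
    pass  # model forward would go here
```
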