From f9199b04c464f6949110b61dc548cd4495a420a1 Mon Sep 17 00:00:00 2001
From: Edresson Casanova
Date: Mon, 4 Jul 2022 14:45:46 -0300
Subject: [PATCH] Fix bug in external embeddings training

Make the "emotion_name" field optional in VitsDataset so that items
without emotion labels no longer raise a KeyError, remove the
self._freeze_layers() call from the training step, and wrap the
synthesis of each test sentence in a try/except so that one failing
sentence is logged instead of aborting the whole test run. The
external-emotion training test also gets longer test sentences,
disables duration-predictor conditioning on the speaker, and enables
mixed precision and cudnn benchmark.

---
 TTS/tts/models/vits.py                        | 45 ++++++++++---------
 ...ts_d_vector_with_external_emotion_train.py |  9 +++-
 2 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index 98beb97a..9597c78e 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -1,5 +1,6 @@
 import math
 import os
+import traceback
 import numpy as np
 import pyworld as pw
 from dataclasses import dataclass, field, replace
@@ -312,7 +313,7 @@ class VitsDataset(TTSDataset):
             "wav_file": wav_filename,
             "speaker_name": item["speaker_name"],
             "language_name": item["language"],
-            "emotion_name": item["emotion_name"],
+            "emotion_name": item["emotion_name"] if "emotion_name" in item else None,
 
             "pitch": f0,
             "alignments": alignments,
@@ -1857,7 +1858,6 @@ class Vits(BaseTTS):
         Returns:
             Tuple[Dict, Dict]: Model ouputs and computed losses.
         """
-        self._freeze_layers()
 
         spec_lens = batch["spec_lens"]
 
@@ -2163,24 +2163,29 @@ class Vits(BaseTTS):
         test_sentences = self.config.test_sentences
         for idx, s_info in enumerate(test_sentences):
             aux_inputs = self.get_aux_input_from_test_sentences(s_info)
-            wav, alignment, _, _ = synthesis(
-                self,
-                aux_inputs["text"],
-                self.config,
-                "cuda" in str(next(self.parameters()).device),
-                speaker_id=aux_inputs["speaker_id"],
-                d_vector=aux_inputs["d_vector"],
-                style_wav=aux_inputs["style_wav"],
-                language_id=aux_inputs["language_id"],
-                emotion_embedding=aux_inputs["emotion_embedding"],
-                emotion_id=aux_inputs["emotion_ids"],
-                style_speaker_id=aux_inputs["style_speaker_id"],
-                style_speaker_d_vector=aux_inputs["style_speaker_d_vector"],
-                use_griffin_lim=True,
-                do_trim_silence=False,
-            ).values()
-            test_audios["{}-audio".format(idx)] = wav
-            test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False)
+            try:
+                wav, alignment, _, _ = synthesis(
+                    self,
+                    aux_inputs["text"],
+                    self.config,
+                    "cuda" in str(next(self.parameters()).device),
+                    speaker_id=aux_inputs["speaker_id"],
+                    d_vector=aux_inputs["d_vector"],
+                    style_wav=aux_inputs["style_wav"],
+                    language_id=aux_inputs["language_id"],
+                    emotion_embedding=aux_inputs["emotion_embedding"],
+                    emotion_id=aux_inputs["emotion_ids"],
+                    style_speaker_id=aux_inputs["style_speaker_id"],
+                    style_speaker_d_vector=aux_inputs["style_speaker_d_vector"],
+                    use_griffin_lim=True,
+                    do_trim_silence=False,
+                ).values()
+                test_audios["{}-audio".format(idx)] = wav
+                test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False)
+            except Exception:
+                print("Error during synthesis of the test sentence:", aux_inputs)
+                traceback.print_exc()
+
         return {"figures": test_figures, "audios": test_audios}
 
     def test_log(
diff --git a/tests/tts_tests/test_vits_d_vector_with_external_emotion_train.py b/tests/tts_tests/test_vits_d_vector_with_external_emotion_train.py
index e5046826..9fccfedf 100644
--- a/tests/tts_tests/test_vits_d_vector_with_external_emotion_train.py
+++ b/tests/tts_tests/test_vits_d_vector_with_external_emotion_train.py
@@ -26,6 +26,9 @@ config = VitsConfig(
     print_step=1,
     print_eval=True,
     test_sentences=[
+        ["There", "ljspeech-1", None, None, "ljspeech-1"],
+        ["To access Overdub Stock Voices, first make sure you’ve updated the app, then click Edit > Manage Speakers inside any Composition. Add a new Speaker with any name you choose, and then click Overdub Voice to select from our included Stock Voices. You don’t need to have created an Overdub Voice of your own to access these Stock Voices, so you can start using these voices in no time at all!", "ljspeech-1", None, None, "ljspeech-1"],
+        ["To access Overdub Stock Voices, first make sure you’ve updated the app, then click Edit > Manage Speakers inside any Composition. Add a new Speaker with any name you choose, and then click Overdub Voice to select from our included Stock Voices. You don’t need to have created an Overdub Voice of your own to access these Stock Voices, so you can start using these voices in no time at all. To access Overdub Stock Voices, first make sure you’ve updated the app, then click Edit > Manage Speakers inside any Composition. Add a new Speaker with any name you choose, and then click Overdub Voice to select from our included Stock Voices. You don’t need to have created an Overdub Voice of your own to access these Stock Voices, so you can start using these voices in no time at all!", "ljspeech-2", None, None, "ljspeech-2"],
         ["Be a voice, not an echo.", "ljspeech-1", None, None, "ljspeech-1"],
     ],
 )
@@ -45,9 +48,13 @@ config.model_args.d_vector_dim = 256
 config.model_args.use_external_emotions_embeddings = True
 config.model_args.use_emotion_embedding = False
 config.model_args.emotion_embedding_dim = 256
-config.model_args.emotion_just_encoder = True
 config.model_args.external_emotions_embs_file = "tests/data/ljspeech/speakers.json"
+config.model_args.condition_dp_on_speaker = False
 config.use_style_weighted_sampler = True
+
+config.mixed_precision = True
+config.cudnn_benchmark = True
+
 # consistency loss
 # config.model_args.use_emotion_encoder_as_loss = True
 # config.model_args.encoder_model_path = "/raid/edresson/dev/Checkpoints/Coqui-Realesead/tts_models--multilingual--multi-dataset--your_tts/model_se.pth.tar"
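
Both fixes follow the same defensive pattern: optional metadata falls back to
None instead of raising a KeyError, and a per-sentence failure is logged
without killing the whole test run. A minimal, self-contained Python sketch of
that pattern follows; the fake_synthesis helper and the sample items are
hypothetical stand-ins for the real synthesis call and dataset items, not part
of this patch.

    import traceback

    # Hypothetical dataset items: only some of them carry an "emotion_name" field.
    items = [
        {"speaker_name": "ljspeech-1", "emotion_name": "happy"},
        {"speaker_name": "ljspeech-2"},  # no emotion metadata
    ]

    def fake_synthesis(emotion_name):
        # Hypothetical stand-in for the real synthesis call; it fails when the
        # emotion metadata is missing, as a lookup into an embeddings file would.
        if emotion_name is None:
            raise ValueError("missing emotion embedding")
        return b"wav-bytes"

    test_audios = {}
    for idx, item in enumerate(items):
        # Same guard as the patched VitsDataset: a missing key becomes None,
        # not a KeyError.
        emotion_name = item["emotion_name"] if "emotion_name" in item else None
        try:
            test_audios["{}-audio".format(idx)] = fake_synthesis(emotion_name)
        except Exception:
            # Same guard as the patched Vits.test_run: log the failure and keep going.
            print("Error during synthesis of the test sentence:", item)
            traceback.print_exc()

    print(sorted(test_audios))  # ['0-audio']: the first item still produced audio

Catching Exception rather than using a bare except keeps KeyboardInterrupt and
SystemExit propagating, which is why the test_run hunk above uses it.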