Bug fix in external embeddings training

Edresson Casanova 2022-07-04 14:45:46 -03:00
parent 65222b1f8c
commit f9199b04c4
2 changed files with 33 additions and 21 deletions


@@ -1,5 +1,6 @@
 import math
 import os
+import traceback
 import numpy as np
 import pyworld as pw
 from dataclasses import dataclass, field, replace
@@ -312,7 +313,7 @@ class VitsDataset(TTSDataset):
             "wav_file": wav_filename,
             "speaker_name": item["speaker_name"],
             "language_name": item["language"],
-            "emotion_name": item["emotion_name"],
+            "emotion_name": item["emotion_name"] if "emotion_name" in item else None,
             "pitch": f0,
             "alignments": alignments,
@@ -1857,7 +1858,6 @@ class Vits(BaseTTS):
         Returns:
             Tuple[Dict, Dict]: Model outputs and computed losses.
         """
-
         self._freeze_layers()
         spec_lens = batch["spec_lens"]
@@ -2163,24 +2163,29 @@
         test_sentences = self.config.test_sentences
         for idx, s_info in enumerate(test_sentences):
             aux_inputs = self.get_aux_input_from_test_sentences(s_info)
-            wav, alignment, _, _ = synthesis(
-                self,
-                aux_inputs["text"],
-                self.config,
-                "cuda" in str(next(self.parameters()).device),
-                speaker_id=aux_inputs["speaker_id"],
-                d_vector=aux_inputs["d_vector"],
-                style_wav=aux_inputs["style_wav"],
-                language_id=aux_inputs["language_id"],
-                emotion_embedding=aux_inputs["emotion_embedding"],
-                emotion_id=aux_inputs["emotion_ids"],
-                style_speaker_id=aux_inputs["style_speaker_id"],
-                style_speaker_d_vector=aux_inputs["style_speaker_d_vector"],
-                use_griffin_lim=True,
-                do_trim_silence=False,
-            ).values()
-            test_audios["{}-audio".format(idx)] = wav
-            test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False)
+            try:
+                wav, alignment, _, _ = synthesis(
+                    self,
+                    aux_inputs["text"],
+                    self.config,
+                    "cuda" in str(next(self.parameters()).device),
+                    speaker_id=aux_inputs["speaker_id"],
+                    d_vector=aux_inputs["d_vector"],
+                    style_wav=aux_inputs["style_wav"],
+                    language_id=aux_inputs["language_id"],
+                    emotion_embedding=aux_inputs["emotion_embedding"],
+                    emotion_id=aux_inputs["emotion_ids"],
+                    style_speaker_id=aux_inputs["style_speaker_id"],
+                    style_speaker_d_vector=aux_inputs["style_speaker_d_vector"],
+                    use_griffin_lim=True,
+                    do_trim_silence=False,
+                ).values()
+                test_audios["{}-audio".format(idx)] = wav
+                test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False)
+            except:
+                print("Error during the synthesis of the sentence:", aux_inputs)
+                traceback.print_exc()
         return {"figures": test_figures, "audios": test_audios}
 
     def test_log(
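
Note: the hunk above wraps per-sentence synthesis in try/except so that one failing test sentence no longer aborts the whole test run; the new `traceback` import supports the error logging. A minimal sketch of the same pattern in isolation (`synthesize_one` is a hypothetical stand-in for the model call, not a function from the repo):

```python
import traceback

def run_test_sentences(sentences, synthesize_one):
    """Render each sentence; log and skip failures instead of aborting the run."""
    outputs = {}
    for idx, sentence in enumerate(sentences):
        try:
            outputs["{}-audio".format(idx)] = synthesize_one(sentence)
        except Exception:  # the diff uses a bare `except:`; Exception spares KeyboardInterrupt
            print("Error during the synthesis of the sentence:", sentence)
            traceback.print_exc()
    return outputs
```
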


@@ -26,6 +26,9 @@ config = VitsConfig(
     print_step=1,
     print_eval=True,
     test_sentences=[
+        ["There", "ljspeech-1", None, None, "ljspeech-1"],
+        ["To access Overdub Stock Voices, first make sure you've updated the app, then click Edit > Manage Speakers inside any Composition. Add a new Speaker with any name you choose, and then click Overdub Voice to select from our included Stock Voices. You don't need to have created an Overdub Voice of your own to access these Stock Voices, so you can start using these voices in no time at all!", "ljspeech-1", None, None, "ljspeech-1"],
+        ["To access Overdub Stock Voices, first make sure you've updated the app, then click Edit > Manage Speakers inside any Composition. Add a new Speaker with any name you choose, and then click Overdub Voice to select from our included Stock Voices. You don't need to have created an Overdub Voice of your own to access these Stock Voices, so you can start using these voices in no time at all. To access Overdub Stock Voices, first make sure you've updated the app, then click Edit > Manage Speakers inside any Composition. Add a new Speaker with any name you choose, and then click Overdub Voice to select from our included Stock Voices. You don't need to have created an Overdub Voice of your own to access these Stock Voices, so you can start using these voices in no time at all!", "ljspeech-2", None, None, "ljspeech-2"],
         ["Be a voice, not an echo.", "ljspeech-1", None, None, "ljspeech-1"],
     ],
 )
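
Note: each test-sentence row has five positional fields. Judging from how `get_aux_input_from_test_sentences` is used in the first file, the order appears to be text, speaker, style wav, language, style speaker; that reading is inferred from this diff alone, not confirmed elsewhere. A hypothetical unpacking:

```python
# Hypothetical parser for a five-element test-sentence row; field names are
# inferred from this diff and may not match the repo's actual parser.
def parse_test_sentence(row):
    text, speaker_name, style_wav, language_name, style_speaker_name = row
    return {
        "text": text,
        "speaker_name": speaker_name,
        "style_wav": style_wav,
        "language_name": language_name,
        "style_speaker_name": style_speaker_name,
    }

print(parse_test_sentence(["Be a voice, not an echo.", "ljspeech-1", None, None, "ljspeech-1"]))
```
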
@@ -45,9 +48,13 @@ config.model_args.d_vector_dim = 256
 config.model_args.use_external_emotions_embeddings = True
 config.model_args.use_emotion_embedding = False
 config.model_args.emotion_embedding_dim = 256
+config.model_args.emotion_just_encoder = True
 config.model_args.external_emotions_embs_file = "tests/data/ljspeech/speakers.json"
+config.model_args.condition_dp_on_speaker = False
 config.use_style_weighted_sampler = True
+config.mixed_precision = True
+config.cudnn_benchmark = True
 # consistency loss
 # config.model_args.use_emotion_encoder_as_loss = True
 # config.model_args.encoder_model_path = "/raid/edresson/dev/Checkpoints/Coqui-Realesead/tts_models--multilingual--multi-dataset--your_tts/model_se.pth.tar"
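
Note: `mixed_precision` and `cudnn_benchmark` are trainer-level switches. Assuming they map to the usual PyTorch mechanisms (torch AMP and cuDNN autotuning; this is an assumption about the trainer, not code from this repo), they roughly amount to:

```python
import torch

# cudnn_benchmark=True: let cuDNN autotune conv kernels; pays off when input
# shapes are mostly fixed, can hurt when they vary a lot between batches.
torch.backends.cudnn.benchmark = True

# mixed_precision=True: run the forward pass under autocast and scale the loss
# so float16 gradients do not underflow (illustrative, not the trainer's code).
scaler = torch.cuda.amp.GradScaler()
with torch.cuda.amp.autocast():
    pass  # model forward would go here
```
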