mirror of https://github.com/coqui-ai/TTS.git
Bug fix on external embeddings training
This commit is contained in:
parent 65222b1f8c
commit f9199b04c4
@@ -1,5 +1,6 @@
 import math
 import os
+import traceback
 import numpy as np
 import pyworld as pw
 from dataclasses import dataclass, field, replace
@@ -312,7 +313,7 @@ class VitsDataset(TTSDataset):
             "wav_file": wav_filename,
             "speaker_name": item["speaker_name"],
             "language_name": item["language"],
-            "emotion_name": item["emotion_name"],
+            "emotion_name": item["emotion_name"] if "emotion_name" in item else None,
             "pitch": f0,
             "alignments": alignments,
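This guarded lookup looks like the headline fix: dataset items whose metadata carries no "emotion_name" key now yield None instead of raising KeyError mid-training. A minimal sketch of the same pattern with hypothetical metadata entries; dict.get is the idiomatic one-call equivalent:

items = [
    {"speaker_name": "spk1", "language": "en", "emotion_name": "happy"},
    {"speaker_name": "spk2", "language": "en"},  # no emotion annotation
]

for item in items:
    # Same behavior as: item["emotion_name"] if "emotion_name" in item else None
    emotion_name = item.get("emotion_name")
    print(item["speaker_name"], emotion_name)  # -> spk1 happy / spk2 None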
@@ -1857,7 +1858,6 @@ class Vits(BaseTTS):
         Returns:
             Tuple[Dict, Dict]: Model ouputs and computed losses.
         """
-
         self._freeze_layers()

         spec_lens = batch["spec_lens"]
@@ -2163,24 +2163,29 @@ class Vits(BaseTTS):
         test_sentences = self.config.test_sentences
         for idx, s_info in enumerate(test_sentences):
             aux_inputs = self.get_aux_input_from_test_sentences(s_info)
-            wav, alignment, _, _ = synthesis(
-                self,
-                aux_inputs["text"],
-                self.config,
-                "cuda" in str(next(self.parameters()).device),
-                speaker_id=aux_inputs["speaker_id"],
-                d_vector=aux_inputs["d_vector"],
-                style_wav=aux_inputs["style_wav"],
-                language_id=aux_inputs["language_id"],
-                emotion_embedding=aux_inputs["emotion_embedding"],
-                emotion_id=aux_inputs["emotion_ids"],
-                style_speaker_id=aux_inputs["style_speaker_id"],
-                style_speaker_d_vector=aux_inputs["style_speaker_d_vector"],
-                use_griffin_lim=True,
-                do_trim_silence=False,
-            ).values()
-            test_audios["{}-audio".format(idx)] = wav
-            test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False)
+            try:
+                wav, alignment, _, _ = synthesis(
+                    self,
+                    aux_inputs["text"],
+                    self.config,
+                    "cuda" in str(next(self.parameters()).device),
+                    speaker_id=aux_inputs["speaker_id"],
+                    d_vector=aux_inputs["d_vector"],
+                    style_wav=aux_inputs["style_wav"],
+                    language_id=aux_inputs["language_id"],
+                    emotion_embedding=aux_inputs["emotion_embedding"],
+                    emotion_id=aux_inputs["emotion_ids"],
+                    style_speaker_id=aux_inputs["style_speaker_id"],
+                    style_speaker_d_vector=aux_inputs["style_speaker_d_vector"],
+                    use_griffin_lim=True,
+                    do_trim_silence=False,
+                ).values()
+                test_audios["{}-audio".format(idx)] = wav
+                test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False)
+            except:
+                print("Error during the synthesis of the sentence:", aux_inputs)
+                traceback.print_exc()
+
         return {"figures": test_figures, "audios": test_audios}

     def test_log(
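Wrapping test-sentence synthesis in try/except keeps a long training run alive when a single test sentence fails: the offending inputs are printed and the stack trace is logged via the newly imported traceback module, and the loop moves on to the next sentence. A self-contained sketch of the same guard, using a hypothetical synthesize() stand-in; it catches Exception rather than the diff's bare except, the safer variant since a bare except also swallows KeyboardInterrupt:

import traceback

def synthesize(text):
    # Hypothetical stand-in for the real synthesis() call.
    if "fail" in text:
        raise RuntimeError("synthesis blew up")
    return "wav({})".format(text)

test_audios = {}
for idx, text in enumerate(["hello", "please fail", "world"]):
    try:
        test_audios["{}-audio".format(idx)] = synthesize(text)
    except Exception:  # log and keep going instead of aborting the run
        print("Error during the synthesis of the sentence:", text)
        traceback.print_exc()

print(sorted(test_audios))  # -> ['0-audio', '2-audio']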
@@ -26,6 +26,9 @@ config = VitsConfig(
     print_step=1,
     print_eval=True,
     test_sentences=[
+        ["There", "ljspeech-1", None, None, "ljspeech-1"],
+        ["To access Overdub Stock Voices, first make sure you’ve updated the app, then click Edit > Manage Speakers inside any Composition. Add a new Speaker with any name you choose, and then click Overdub Voice to select from our included Stock Voices. You don’t need to have created an Overdub Voice of your own to access these Stock Voices, so you can start using these voices in no time at all!","ljspeech-1", None, None, "ljspeech-1"],
+        ["To access Overdub Stock Voices, first make sure you’ve updated the app, then click Edit > Manage Speakers inside any Composition. Add a new Speaker with any name you choose, and then click Overdub Voice to select from our included Stock Voices. You don’t need to have created an Overdub Voice of your own to access these Stock Voices, so you can start using these voices in no time at all. To access Overdub Stock Voices, first make sure you’ve updated the app, then click Edit > Manage Speakers inside any Composition. Add a new Speaker with any name you choose, and then click Overdub Voice to select from our included Stock Voices. You don’t need to have created an Overdub Voice of your own to access these Stock Voices, so you can start using these voices in no time at all!", "ljspeech-2", None, None, "ljspeech-2"],
         ["Be a voice, not an echo.", "ljspeech-1", None, None, "ljspeech-1"],
     ],
 )
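Upstream Coqui TTS test sentences are lists of up to four fields, [text, speaker_name, style_wav, language_name]. The fifth field here is specific to this fork; judging by get_aux_input_from_test_sentences it plausibly names the style reference speaker, but that is an assumption, not something the diff documents. A hedged sketch of unpacking either shape:

# Assumed field order for this fork; the fifth field (style_ref) is a guess.
sentence = ["Be a voice, not an echo.", "ljspeech-1", None, None, "ljspeech-1"]

def unpack_test_sentence(s):
    # Pad with Nones so shorter, upstream-style entries also unpack cleanly.
    text, speaker, style_wav, language, style_ref = (list(s) + [None] * 5)[:5]
    return {"text": text, "speaker_name": speaker, "style_wav": style_wav,
            "language_name": language, "style_ref": style_ref}

print(unpack_test_sentence(sentence)["speaker_name"])  # -> ljspeech-1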
@@ -45,9 +48,13 @@ config.model_args.d_vector_dim = 256
 config.model_args.use_external_emotions_embeddings = True
 config.model_args.use_emotion_embedding = False
 config.model_args.emotion_embedding_dim = 256
-config.model_args.emotion_just_encoder = True
 config.model_args.external_emotions_embs_file = "tests/data/ljspeech/speakers.json"
+config.model_args.condition_dp_on_speaker = False
 config.use_style_weighted_sampler = True
+
+config.mixed_precision = True
+config.cudnn_benchmark = True
+
 # consistency loss
 # config.model_args.use_emotion_encoder_as_loss = True
 # config.model_args.encoder_model_path = "/raid/edresson/dev/Checkpoints/Coqui-Realesead/tts_models--multilingual--multi-dataset--your_tts/model_se.pth.tar"
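external_emotions_embs_file points at a speakers.json from the test fixtures, so the external emotion embeddings reuse Coqui's d-vector file layout: a JSON mapping from clip keys to a name plus an embedding vector. A minimal sketch of writing a compatible file; the clip key and the random vector are placeholders (real files come from an encoder script), and the vector length must match emotion_embedding_dim above:

import json
import random

# Coqui-style embedding file: {clip_key: {"name": ..., "embedding": [...]}}.
embeddings = {
    "LJ001-0001.wav": {                   # placeholder clip key
        "name": "ljspeech-1",             # reused here as the emotion/style label
        "embedding": [random.random() for _ in range(256)],  # 256 = emotion_embedding_dim
    },
}

with open("emotions.json", "w", encoding="utf-8") as f:
    json.dump(embeddings, f)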