Fix rebase issues

Edresson Casanova 2022-05-20 18:29:39 -03:00
parent 98c2834b17
commit 0fb1b200c6
8 changed files with 29 additions and 26 deletions

View File

@@ -25,7 +25,9 @@ parser.add_argument("--output_path", type=str, help="Path for output `pth` or `j
 parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None)
 parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
 parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False)
-parser.add_argument("--use_predicted_label", type=bool, help="If True and predicted label is available with will use it.", default=False)
+parser.add_argument(
+    "--use_predicted_label", type=bool, help="If True and predicted label is available with will use it.", default=False
+)
 args = parser.parse_args()

View File

@@ -178,14 +178,10 @@ If you don't specify any models, then it uses LJSpeech based English model.
     help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
     default=None,
 )
 parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
-<<<<<<< HEAD
 parser.add_argument("--style_wav", type=str, help="Wav path file for prosody reference.", default=None)
 parser.add_argument(
-    "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
+    "--capacitron_style_text", type=str, help="Transcription of the style_wav reference.", default=None
 )
-parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
-=======
->>>>>>> 3a524b05... Add prosody encoder params on config
 parser.add_argument(
     "--list_speaker_idxs",
     help="List available speaker ids for the defined multi-speaker model.",
@@ -324,9 +320,8 @@ If you don't specify any models, then it uses LJSpeech based English model.
         args.speaker_idx,
         args.language_idx,
         args.speaker_wav,
-        style_wav=args.gst_style,
         reference_wav=args.reference_wav,
-        style_wav=args.capacitron_style_wav,
+        style_wav=args.style_wav,
         style_text=args.capacitron_style_text,
         reference_speaker_name=args.reference_speaker_idx,
         emotion_name=args.emotion_idx,
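
As resolved above, a prosody reference wav is now passed through the generic --style_wav flag and --capacitron_style_wav is dropped. A minimal sketch of how this could be exercised, modelled on the repository's CLI tests; the config, checkpoint, and output paths below are placeholders, not files from this commit:

from tests import get_device_id, run_cli

# Reference audio whose prosody should be transferred (path taken from the repo's test data).
style_wav_path = "tests/data/ljspeech/wavs/LJ001-0001.wav"
inference_command = (
    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' "
    f"--config_path config.json --model_path checkpoint.pth --out_path output.wav "
    f"--style_wav {style_wav_path}"
)
run_cli(inference_command)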

View File

@@ -12,16 +12,16 @@ from TTS.tts.datasets.formatters import *
 def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
     """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.
-    Args:
-        items (List[List]):
-            A list of samples. Each sample is a dict containing the keys "text", "audio_file", and "speaker_name".
+    Args:
+        items (List[List]):
+            A list of samples. Each sample is a dict containing the keys "text", "audio_file", and "speaker_name".
-        eval_split_max_size (int):
-            Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled).
+        eval_split_max_size (int):
+            Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled).
-        eval_split_size (float):
-            If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
-            If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
+        eval_split_size (float):
+            If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
+            If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
     """
     speakers = [item["speaker_name"] for item in items]
     is_multi_speaker = len(set(speakers)) > 1
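
For reference, a small usage sketch of split_dataset as documented above. The sample items are made up, and the (eval, train) return order is assumed from the upstream implementation rather than shown in this hunk:

from TTS.tts.datasets import split_dataset

# Toy samples with the documented keys; two speakers so the multi-speaker path is taken.
items = [
    {"text": f"sentence {i}", "audio_file": f"wavs/{i:04d}.wav", "speaker_name": f"spk_{i % 2}"}
    for i in range(100)
]

# Reserve 10% of the samples for evaluation; an absolute count (> 1) is also accepted.
eval_samples, train_samples = split_dataset(items, eval_split_max_size=None, eval_split_size=0.1)
print(len(eval_samples), len(train_samples))  # roughly 10 and 90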

View File

@@ -700,7 +700,9 @@ class Vits(BaseTTS):
         if self.args.use_text_enc_spk_reversal_classifier:
             self.speaker_text_enc_reversal_classifier = ReversalClassifier(
-                in_channels=self.args.hidden_channels + self.args.emotion_embedding_dim + self.args.prosody_embedding_dim,
+                in_channels=self.args.hidden_channels
+                + self.args.emotion_embedding_dim
+                + self.args.prosody_embedding_dim,
                 out_channels=self.num_speakers,
                 hidden_channels=256,
             )
@@ -1483,7 +1485,7 @@
                 gt_cons_emb=self.model_outputs_cache["gt_cons_emb"],
                 syn_cons_emb=self.model_outputs_cache["syn_cons_emb"],
                 loss_prosody_enc_spk_rev_classifier=self.model_outputs_cache["loss_prosody_enc_spk_rev_classifier"],
-                loss_text_enc_spk_rev_classifier=self.model_outputs_cache["loss_text_enc_spk_rev_classifier"]
+                loss_text_enc_spk_rev_classifier=self.model_outputs_cache["loss_text_enc_spk_rev_classifier"],
             )
         return self.model_outputs_cache, loss_dict
@@ -1654,7 +1656,11 @@
         if (
             self.speaker_manager is not None
             and self.speaker_manager.ids
-            and (self.args.use_speaker_embedding or self.args.use_prosody_encoder or self.args.use_text_enc_spk_reversal_classifier)
+            and (
+                self.args.use_speaker_embedding
+                or self.args.use_prosody_encoder
+                or self.args.use_text_enc_spk_reversal_classifier
+            )
         ):
             speaker_ids = [self.speaker_manager.ids[sn] for sn in batch["speaker_names"]]
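
The re-wrapped in_channels expression above only changes formatting; the value is still the sum of the text-encoder width and the two embedding sizes. A worked example with hypothetical dimensions (not values from this commit):

# Hypothetical config values, for illustration only.
hidden_channels = 192
emotion_embedding_dim = 64
prosody_embedding_dim = 64

# Classifier input: text-encoder features concatenated with the emotion and prosody embeddings.
in_channels = hidden_channels + emotion_embedding_dim + prosody_embedding_dim
print(in_channels)  # 320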

View File

@@ -181,7 +181,7 @@ def synthesis(
         style_feature = compute_style_feature(style_wav, model.ap, cuda=use_cuda)
         style_feature = style_feature.transpose(1, 2)  # [1, time, depth]
-    if hasattr(model, 'compute_style_feature') and style_wav is not None:
+    if hasattr(model, "compute_style_feature") and style_wav is not None:
         style_feature = model.compute_style_feature(style_wav)
     # convert text to sequence of token IDs

View File

@@ -218,8 +218,8 @@ class ModelManager(object):
         output_stats_path = os.path.join(output_path, "scale_stats.npy")
         output_d_vector_file_path = os.path.join(output_path, "speakers.json")
         output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json")
-        speaker_encoder_config_path = os.path.join(output_path, "config_se.json")
-        speaker_encoder_model_path = self._find_speaker_encoder(output_path)
+        encoder_config_path = os.path.join(output_path, "config_se.json")
+        encoder_model_path = self._find_speaker_encoder(output_path)
         # update the scale_path.npy file path in the model config.json
         self._update_path("audio.stats_path", output_stats_path, config_path)

View File

@@ -76,8 +76,8 @@ continue_restore_path, _ = get_last_checkpoint(continue_path)
 out_wav_path = os.path.join(get_tests_output_path(), "output.wav")
 speaker_id = "ljspeech-1"
 emotion_id = "ljspeech-3"
-continue_speakers_path = os.path.join(continue_path, "speakers.json")
-continue_emotion_path = os.path.join(continue_path, "speakers.json")
+continue_speakers_path = config.d_vector_file
+continue_emotion_path = os.path.join(continue_path, "emotions.json")
 inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --emotion_idx {emotion_id} --speakers_file_path {continue_speakers_path} --emotions_file_path {continue_emotion_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"

View File

@@ -71,7 +71,7 @@ style_wav_path = "tests/data/ljspeech/wavs/LJ001-0001.wav"
 continue_speakers_path = os.path.join(continue_path, "speakers.json")
-inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path} --gst_style {style_wav_path}"
+inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path} --style_wav {style_wav_path}"
 run_cli(inference_command)
 # restore the model and continue training for one more epoch