Fix rebase issues

Edresson Casanova 2022-05-20 18:29:39 -03:00
parent 98c2834b17
commit 0fb1b200c6
8 changed files with 29 additions and 26 deletions

View File

@@ -25,7 +25,9 @@ parser.add_argument("--output_path", type=str, help="Path for output `pth` or `j
 parser.add_argument("--old_file", type=str, help="Previous embedding file to only compute new audios.", default=None)
 parser.add_argument("--disable_cuda", type=bool, help="Flag to disable cuda.", default=False)
 parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False)
-parser.add_argument("--use_predicted_label", type=bool, help="If True and predicted label is available with will use it.", default=False)
+parser.add_argument(
+    "--use_predicted_label", type=bool, help="If True and predicted label is available with will use it.", default=False
+)
 args = parser.parse_args()
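
A side note on the type=bool flags in this hunk (standard argparse behavior, not something this commit changes): argparse applies bool() to the raw string, so any non-empty value, including "False", parses as True. A minimal illustration:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--no_eval", type=bool, default=False)

print(parser.parse_args([]).no_eval)                      # False (the default)
print(parser.parse_args(["--no_eval", "True"]).no_eval)   # True
print(parser.parse_args(["--no_eval", "False"]).no_eval)  # also True: bool("False") is truthy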

View File

@@ -178,14 +178,10 @@ If you don't specify any models, then it uses LJSpeech based English model.
     help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
     default=None,
 )
-parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
-<<<<<<< HEAD
-parser.add_argument(
-    "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
-)
-parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
-=======
->>>>>>> 3a524b05... Add prosody encoder params on config
+parser.add_argument("--style_wav", type=str, help="Wav path file for prosody reference.", default=None)
+parser.add_argument(
+    "--capacitron_style_text", type=str, help="Transcription of the style_wav reference.", default=None
+)
 parser.add_argument(
     "--list_speaker_idxs",
     help="List available speaker ids for the defined multi-speaker model.",
@@ -324,9 +320,8 @@ If you don't specify any models, then it uses LJSpeech based English model.
         args.speaker_idx,
         args.language_idx,
         args.speaker_wav,
-        style_wav=args.gst_style,
         reference_wav=args.reference_wav,
-        style_wav=args.capacitron_style_wav,
+        style_wav=args.style_wav,
         style_text=args.capacitron_style_text,
         reference_speaker_name=args.reference_speaker_idx,
         emotion_name=args.emotion_idx,
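
After the conflict resolution above, the GST- and Capacitron-specific flags collapse into a single --style_wav option, with --capacitron_style_text carrying the transcription of that reference. A hypothetical invocation, with placeholder paths and text:

tts --text "This is an example." --model_path /path/to/model.pth --config_path /path/to/config.json --style_wav /path/to/reference.wav --capacitron_style_text "Transcription of the reference." --out_path /tmp/output.wav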

View File

@@ -12,16 +12,16 @@ from TTS.tts.datasets.formatters import *
 def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
     """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.

     Args:
         items (List[List]):
             A list of samples. Each sample is a dict containing the keys "text", "audio_file", and "speaker_name".

         eval_split_max_size (int):
             Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled).

         eval_split_size (float):
             If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
             If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
     """
     speakers = [item["speaker_name"] for item in items]
     is_multi_speaker = len(set(speakers)) > 1
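
A minimal usage sketch of the function documented above, assuming it is importable from TTS.tts.datasets as in upstream Coqui TTS; the sample dicts are placeholders and the (eval, train) return order is an assumption to verify against the function body:

from TTS.tts.datasets import split_dataset

# Hypothetical samples; real items come from the dataset formatters.
items = [
    {"text": "hello there", "audio_file": "wavs/a.wav", "speaker_name": "spk_0"},
    {"text": "general kenobi", "audio_file": "wavs/b.wav", "speaker_name": "spk_0"},
    {"text": "good morning", "audio_file": "wavs/c.wav", "speaker_name": "spk_1"},
    {"text": "good evening", "audio_file": "wavs/d.wav", "speaker_name": "spk_1"},
]

# eval_split_size <= 1.0 is a fraction of the dataset; > 1 is an absolute sample count.
eval_samples, train_samples = split_dataset(items, eval_split_max_size=None, eval_split_size=0.25)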

View File

@@ -700,7 +700,9 @@ class Vits(BaseTTS):
         if self.args.use_text_enc_spk_reversal_classifier:
             self.speaker_text_enc_reversal_classifier = ReversalClassifier(
-                in_channels=self.args.hidden_channels + self.args.emotion_embedding_dim + self.args.prosody_embedding_dim,
+                in_channels=self.args.hidden_channels
+                + self.args.emotion_embedding_dim
+                + self.args.prosody_embedding_dim,
                 out_channels=self.num_speakers,
                 hidden_channels=256,
             )
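
ReversalClassifier appears to be the repository's own adversarial speaker classifier; the reformatted in_channels is simply the summed widths of the text-encoder hidden states, emotion embedding, and prosody embedding it consumes. For orientation only, a generic PyTorch sketch of the gradient-reversal idea such classifiers are usually built on (not this codebase's implementation):

import torch


class GradReverse(torch.autograd.Function):
    # Identity in the forward pass; negated, scaled gradient in the backward pass.
    @staticmethod
    def forward(ctx, x, scale=1.0):
        ctx.scale = scale
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        return -ctx.scale * grad_output, None


class AdversarialSpeakerClassifier(torch.nn.Module):
    # Hypothetical sketch: predict the speaker from reversed features so the
    # upstream encoder is pushed toward speaker-independent representations.
    def __init__(self, in_channels, out_channels, hidden_channels=256):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(in_channels, hidden_channels),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_channels, out_channels),
        )

    def forward(self, x):
        return self.net(GradReverse.apply(x))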
@@ -1483,7 +1485,7 @@ class Vits(BaseTTS):
                 gt_cons_emb=self.model_outputs_cache["gt_cons_emb"],
                 syn_cons_emb=self.model_outputs_cache["syn_cons_emb"],
                 loss_prosody_enc_spk_rev_classifier=self.model_outputs_cache["loss_prosody_enc_spk_rev_classifier"],
-                loss_text_enc_spk_rev_classifier=self.model_outputs_cache["loss_text_enc_spk_rev_classifier"]
+                loss_text_enc_spk_rev_classifier=self.model_outputs_cache["loss_text_enc_spk_rev_classifier"],
             )
         return self.model_outputs_cache, loss_dict
@@ -1654,7 +1656,11 @@ class Vits(BaseTTS):
         if (
             self.speaker_manager is not None
             and self.speaker_manager.ids
-            and (self.args.use_speaker_embedding or self.args.use_prosody_encoder or self.args.use_text_enc_spk_reversal_classifier)
+            and (
+                self.args.use_speaker_embedding
+                or self.args.use_prosody_encoder
+                or self.args.use_text_enc_spk_reversal_classifier
+            )
         ):
             speaker_ids = [self.speaker_manager.ids[sn] for sn in batch["speaker_names"]]

View File

@@ -181,7 +181,7 @@ def synthesis(
         style_feature = compute_style_feature(style_wav, model.ap, cuda=use_cuda)
         style_feature = style_feature.transpose(1, 2)  # [1, time, depth]
-    if hasattr(model, 'compute_style_feature') and style_wav is not None:
+    if hasattr(model, "compute_style_feature") and style_wav is not None:
         style_feature = model.compute_style_feature(style_wav)
     # convert text to sequence of token IDs
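
The guard above prefers a model-provided extractor over the generic style feature computed just before it. A hypothetical standalone sketch of that dispatch order (the helper name and the generic_extractor argument are made up for illustration, not part of the diff):

def resolve_style_feature(model, style_wav, generic_extractor):
    # Generic extraction first, then override with the model's own
    # compute_style_feature when the model defines one.
    style_feature = None
    if style_wav is not None:
        style_feature = generic_extractor(style_wav)
        if hasattr(model, "compute_style_feature"):
            style_feature = model.compute_style_feature(style_wav)
    return style_feature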

View File

@@ -218,8 +218,8 @@ class ModelManager(object):
         output_stats_path = os.path.join(output_path, "scale_stats.npy")
         output_d_vector_file_path = os.path.join(output_path, "speakers.json")
         output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json")
-        speaker_encoder_config_path = os.path.join(output_path, "config_se.json")
-        speaker_encoder_model_path = self._find_speaker_encoder(output_path)
+        encoder_config_path = os.path.join(output_path, "config_se.json")
+        encoder_model_path = self._find_speaker_encoder(output_path)

         # update the scale_path.npy file path in the model config.json
         self._update_path("audio.stats_path", output_stats_path, config_path)
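
_update_path above is the manager's own helper for rewriting fields such as "audio.stats_path" inside the downloaded model's config.json. A rough standalone sketch of the idea, with an assumed name and behavior rather than the library's implementation:

import json

def update_config_path(field_path, new_path, config_file):
    # Set a dot-separated key (e.g. "audio.stats_path") inside a JSON config file.
    with open(config_file, "r", encoding="utf-8") as f:
        config = json.load(f)
    *parents, leaf = field_path.split(".")
    node = config
    for key in parents:
        node = node.setdefault(key, {})
    node[leaf] = new_path
    with open(config_file, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=4)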

View File

@@ -76,8 +76,8 @@ continue_restore_path, _ = get_last_checkpoint(continue_path)
 out_wav_path = os.path.join(get_tests_output_path(), "output.wav")
 speaker_id = "ljspeech-1"
 emotion_id = "ljspeech-3"
-continue_speakers_path = os.path.join(continue_path, "speakers.json")
-continue_emotion_path = os.path.join(continue_path, "speakers.json")
+continue_speakers_path = config.d_vector_file
+continue_emotion_path = os.path.join(continue_path, "emotions.json")

 inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --emotion_idx {emotion_id} --speakers_file_path {continue_speakers_path} --emotions_file_path {continue_emotion_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"

View File

@@ -71,7 +71,7 @@ style_wav_path = "tests/data/ljspeech/wavs/LJ001-0001.wav"
 continue_speakers_path = os.path.join(continue_path, "speakers.json")
-inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path} --gst_style {style_wav_path}"
+inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path} --style_wav {style_wav_path}"
 run_cli(inference_command)
 # restore the model and continue training for one more epoch