mirror of https://github.com/coqui-ai/TTS.git

Fix rebase issues

parent 1a88191a5a
commit 749b217884
@@ -37,7 +37,9 @@ parser.add_argument(
)
parser.add_argument("--use_cuda", type=bool, help="flag to set cuda. Default False", default=False)
parser.add_argument("--no_eval", type=bool, help="Do not compute eval?. Default False", default=False)
parser.add_argument("--use_predicted_label", type=bool, help="If True and predicted label is available with will use it.", default=False)
parser.add_argument(
    "--use_predicted_label", type=bool, help="If True and predicted label is available with will use it.", default=False
)

args = parser.parse_args()
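An aside on the type=bool pattern repeated in this hunk: argparse feeds the raw command-line string to bool(), so any non-empty value, including the literal string "False", comes out as True. A minimal standalone sketch of that behavior; the parser and flag below are illustrative, not repository code:

import argparse

# Standalone demonstration of the argparse type=bool pitfall.
demo_parser = argparse.ArgumentParser()
demo_parser.add_argument("--use_cuda", type=bool, default=False)

print(demo_parser.parse_args([]).use_cuda)                      # False (default is used)
print(demo_parser.parse_args(["--use_cuda", "False"]).use_cuda)  # True, because bool("False") is True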
@@ -178,14 +178,10 @@ If you don't specify any models, then it uses LJSpeech based English model.
    help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths. The d_vectors is computed as their average.",
    default=None,
)
parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
<<<<<<< HEAD
parser.add_argument("--style_wav", type=str, help="Wav path file for prosody reference.", default=None)
parser.add_argument(
    "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
    "--capacitron_style_text", type=str, help="Transcription of the style_wav reference.", default=None
)
parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
=======
>>>>>>> 3a524b05... Add prosody encoder params on config
parser.add_argument(
    "--list_speaker_idxs",
    help="List available speaker ids for the defined multi-speaker model.",
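This is the rebase cleanup itself: the stray conflict markers go away and the Capacitron arguments are untangled from the new --style_wav flag. A rough sketch of what the resolved block plausibly looks like, assembled only from the lines visible above; the exact resolution is not fully recoverable from this view:

import argparse

# Approximation of the resolved argument definitions; treat it as a sketch,
# not an authoritative copy of the final file.
parser = argparse.ArgumentParser()
parser.add_argument("--style_wav", type=str, help="Wav path file for prosody reference.", default=None)
parser.add_argument(
    "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
)
parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
args = parser.parse_args([])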
@@ -324,9 +320,8 @@ If you don't specify any models, then it uses LJSpeech based English model.
        args.speaker_idx,
        args.language_idx,
        args.speaker_wav,
        style_wav=args.gst_style,
        reference_wav=args.reference_wav,
        style_wav=args.capacitron_style_wav,
        style_wav=args.style_wav,
        style_text=args.capacitron_style_text,
        reference_speaker_name=args.reference_speaker_idx,
        emotion_name=args.emotion_idx,
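The competing style_wav= lines above are why this call needed fixing: Python refuses a keyword argument that is passed more than once, so the rebased call could not even be parsed until it was collapsed to a single style_wav=. A tiny standalone check of that rule; the function name f is made up for illustration:

# Minimal demonstration that a repeated keyword argument is a SyntaxError;
# nothing here comes from the repository.
snippet = "f(style_wav=1, style_wav=2)"
try:
    compile(snippet, "<demo>", "eval")
except SyntaxError as err:
    print(err.msg)  # "keyword argument repeated" (exact wording varies by Python version)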
@@ -12,16 +12,16 @@ from TTS.tts.datasets.formatters import *
def split_dataset(items, eval_split_max_size=None, eval_split_size=0.01):
    """Split a dataset into train and eval. Consider speaker distribution in multi-speaker training.

    Args:
        items (List[List]):
            A list of samples. Each sample is a dict containing the keys "text", "audio_file", and "speaker_name".
    Args:
        items (List[List]):
            A list of samples. Each sample is a dict containing the keys "text", "audio_file", and "speaker_name".

        eval_split_max_size (int):
            Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled).
        eval_split_max_size (int):
            Number maximum of samples to be used for evaluation in proportion split. Defaults to None (Disabled).

        eval_split_size (float):
            If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
            If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
        eval_split_size (float):
            If between 0.0 and 1.0 represents the proportion of the dataset to include in the evaluation set.
            If > 1, represents the absolute number of evaluation samples. Defaults to 0.01 (1%).
    """
    speakers = [item["speaker_name"] for item in items]
    is_multi_speaker = len(set(speakers)) > 1
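Since this hunk only touches the docstring, a short usage sketch of split_dataset may help; the sample dicts, the 100-item size, and the eval-first return order are assumptions rather than anything shown in this diff:

# Hypothetical call, shaped after the docstring above.
samples = [
    {"text": f"sentence {i}", "audio_file": f"wavs/{i:04d}.wav", "speaker_name": "ljspeech"}
    for i in range(100)
]

# With eval_split_size=0.01 (the default), 1% of the 100 samples -> 1 eval sample.
# The (eval, train) return order is assumed from typical usage, not from this hunk.
eval_samples, train_samples = split_dataset(samples, eval_split_max_size=None, eval_split_size=0.01)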
@@ -700,7 +700,9 @@ class Vits(BaseTTS):

        if self.args.use_text_enc_spk_reversal_classifier:
            self.speaker_text_enc_reversal_classifier = ReversalClassifier(
                in_channels=self.args.hidden_channels + self.args.emotion_embedding_dim + self.args.prosody_embedding_dim,
                in_channels=self.args.hidden_channels
                + self.args.emotion_embedding_dim
                + self.args.prosody_embedding_dim,
                out_channels=self.num_speakers,
                hidden_channels=256,
            )
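The change here only wraps the in_channels sum across lines; the value is unchanged. For concreteness, with hypothetical config values (none of these numbers appear in the diff):

# Purely illustrative numbers; the real values come from the model config, not from this commit.
hidden_channels = 192
emotion_embedding_dim = 64
prosody_embedding_dim = 64

in_channels = hidden_channels + emotion_embedding_dim + prosody_embedding_dim
print(in_channels)  # 320: the input width the reversal classifier is built for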
@@ -1483,7 +1485,7 @@ class Vits(BaseTTS):
            gt_cons_emb=self.model_outputs_cache["gt_cons_emb"],
            syn_cons_emb=self.model_outputs_cache["syn_cons_emb"],
            loss_prosody_enc_spk_rev_classifier=self.model_outputs_cache["loss_prosody_enc_spk_rev_classifier"],
            loss_text_enc_spk_rev_classifier=self.model_outputs_cache["loss_text_enc_spk_rev_classifier"]
            loss_text_enc_spk_rev_classifier=self.model_outputs_cache["loss_text_enc_spk_rev_classifier"],
        )

        return self.model_outputs_cache, loss_dict
@@ -1654,7 +1656,11 @@ class Vits(BaseTTS):
        if (
            self.speaker_manager is not None
            and self.speaker_manager.ids
            and (self.args.use_speaker_embedding or self.args.use_prosody_encoder or self.args.use_text_enc_spk_reversal_classifier)
            and (
                self.args.use_speaker_embedding
                or self.args.use_prosody_encoder
                or self.args.use_text_enc_spk_reversal_classifier
            )
        ):
            speaker_ids = [self.speaker_manager.ids[sn] for sn in batch["speaker_names"]]
@@ -181,7 +181,7 @@ def synthesis(
        style_feature = compute_style_feature(style_wav, model.ap, cuda=use_cuda)
        style_feature = style_feature.transpose(1, 2)  # [1, time, depth]

    if hasattr(model, 'compute_style_feature') and style_wav is not None:
    if hasattr(model, "compute_style_feature") and style_wav is not None:
        style_feature = model.compute_style_feature(style_wav)

    # convert text to sequence of token IDs
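Only the quote style on the hasattr check changes here, but the guard is worth spelling out: the style feature is computed only for models that actually expose a compute_style_feature method. A self-contained sketch of the pattern; the dummy model is invented for illustration and only the method name comes from the diff:

# Illustrative only: a stand-in model exposing the method named in the hunk above.
class DummyModel:
    def compute_style_feature(self, style_wav):
        # A real model would load the wav and return a style embedding tensor.
        return f"style features computed from {style_wav}"

model = DummyModel()
style_wav = "tests/data/ljspeech/wavs/LJ001-0001.wav"  # path reused from the test hunks below

style_feature = None
if hasattr(model, "compute_style_feature") and style_wav is not None:
    style_feature = model.compute_style_feature(style_wav)

print(style_feature)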
@@ -218,8 +218,8 @@ class ModelManager(object):
        output_stats_path = os.path.join(output_path, "scale_stats.npy")
        output_d_vector_file_path = os.path.join(output_path, "speakers.json")
        output_speaker_ids_file_path = os.path.join(output_path, "speaker_ids.json")
        speaker_encoder_config_path = os.path.join(output_path, "config_se.json")
        speaker_encoder_model_path = self._find_speaker_encoder(output_path)
        encoder_config_path = os.path.join(output_path, "config_se.json")
        encoder_model_path = self._find_speaker_encoder(output_path)

        # update the scale_path.npy file path in the model config.json
        self._update_path("audio.stats_path", output_stats_path, config_path)
@@ -76,8 +76,8 @@ continue_restore_path, _ = get_last_checkpoint(continue_path)
out_wav_path = os.path.join(get_tests_output_path(), "output.wav")
speaker_id = "ljspeech-1"
emotion_id = "ljspeech-3"
continue_speakers_path = os.path.join(continue_path, "speakers.json")
continue_emotion_path = os.path.join(continue_path, "speakers.json")
continue_speakers_path = config.d_vector_file
continue_emotion_path = os.path.join(continue_path, "emotions.json")


inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --emotion_idx {emotion_id} --speakers_file_path {continue_speakers_path} --emotions_file_path {continue_emotion_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path}"
@@ -71,7 +71,7 @@ style_wav_path = "tests/data/ljspeech/wavs/LJ001-0001.wav"
continue_speakers_path = os.path.join(continue_path, "speakers.json")


inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path} --gst_style {style_wav_path}"
inference_command = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' tts --text 'This is an example.' --speaker_idx {speaker_id} --speakers_file_path {continue_speakers_path} --config_path {continue_config_path} --model_path {continue_restore_path} --out_path {out_wav_path} --style_wav {style_wav_path}"
run_cli(inference_command)

# restore the model and continue training for one more epoch