mirror of https://github.com/coqui-ai/TTS.git
Adding pre-trained Overflow model (#2211)
* Adding pretrained Overflow model
* Stabilize HMM
* Fixup model manager
* Return `audio_unique_name` by default
* Distribute max split size over datasets
* Fixup eval_split_size
* Make style
This commit is contained in:
parent
061ac43187
commit
ecea43ec81
|
@ -150,6 +150,15 @@
|
|||
"author": "Eren Gölge @erogol",
|
||||
"license": "apache 2.0",
|
||||
"contact": "egolge@coqui.com"
|
||||
},
|
||||
"overflow": {
|
||||
"description": "Overflow model trained on LJSpeech",
|
||||
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
|
||||
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
|
||||
"commit": "3b1a28f",
|
||||
"author": "Eren Gölge @erogol",
|
||||
"license": "apache 2.0",
|
||||
"contact": "egolge@coqui.ai"
|
||||
}
|
||||
},
|
||||
"vctk": {
|
||||
|
|
|
@ -129,7 +129,8 @@ def load_tts_samples(
|
|||
meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers)
|
||||
meta_data_eval = add_extra_keys(meta_data_eval, language, dataset_name)
|
||||
else:
|
||||
meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_split_max_size, eval_split_size)
|
||||
eval_size_per_dataset = eval_split_max_size // len(datasets) if eval_split_max_size else None
|
||||
meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_size_per_dataset, eval_split_size)
|
||||
meta_data_eval_all += meta_data_eval
|
||||
meta_data_train_all += meta_data_train
|
||||
# load attention masks for the duration predictor training
|
||||
|
|
|
@ -520,6 +520,7 @@ class TTSDataset(Dataset):
|
|||
"raw_text": batch["raw_text"],
|
||||
"pitch": pitch,
|
||||
"language_ids": language_ids,
|
||||
"audio_unique_names": batch["audio_unique_name"],
|
||||
}
|
||||
|
||||
raise TypeError(
|
||||
|
|
|
@ -311,7 +311,7 @@ class NeuralHMM(nn.Module):
|
|||
|
||||
# If the length of the mel is less than the number of states it will select the -inf values leading to nan gradients
|
||||
# Ideally, we should clean the dataset; otherwise, as a small hack, uncomment the line below.
|
||||
# final_log_c = final_log_c.clamp(min=torch.finfo(final_log_c.dtype).min)
|
||||
final_log_c = final_log_c.clamp(min=torch.finfo(final_log_c.dtype).min)
|
||||
|
||||
sum_final_log_c = torch.logsumexp(final_log_c, dim=1)
|
||||
return sum_final_log_c
|
||||
|
|
|
@ -232,6 +232,7 @@ class BaseTTS(BaseTrainerModel):
|
|||
"waveform": waveform,
|
||||
"pitch": pitch,
|
||||
"language_ids": language_ids,
|
||||
"audio_unique_names": batch["audio_unique_names"],
|
||||
}
|
||||
|
||||
def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1):
|
||||
|
@ -388,6 +389,9 @@ class BaseTTS(BaseTrainerModel):
|
|||
test_sentences = self.config.test_sentences
|
||||
aux_inputs = self._get_test_aux_input()
|
||||
for idx, sen in enumerate(test_sentences):
|
||||
if isinstance(sen, list):
|
||||
aux_inputs = self.get_aux_input_from_test_sentences(sen)
|
||||
sen = aux_inputs["text"]
|
||||
outputs_dict = synthesis(
|
||||
self,
|
||||
sen,
|
||||
|
|
|
@ -366,6 +366,7 @@ class ModelManager(object):
|
|||
for file_path in z.namelist()[1:]:
|
||||
src_path = os.path.join(output_folder, file_path)
|
||||
dst_path = os.path.join(output_folder, os.path.basename(file_path))
|
||||
if src_path != dst_path:
|
||||
copyfile(src_path, dst_path)
|
||||
# remove the extracted folder
|
||||
rmtree(os.path.join(output_folder, z.namelist()[0]))
|
||||
|
|
Loading…
Reference in New Issue