mirror of https://github.com/coqui-ai/TTS.git
Adding pre-trained Overflow model (#2211)
* Adding pretrained Overflow model
* Stabilize HMM
* Fixup model manager
* Return `audio_unique_name` by default
* Distribute max split size over datasets
* Fixup eval_split_size
* Make style
parent 061ac43187
commit ecea43ec81
@@ -141,7 +141,7 @@
             "license": "bsd-3-clause",
             "contact": null,
             "commit": null
         },
         "fast_pitch": {
             "description": "FastPitch model trained on LJSpeech using the Aligner Network",
             "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
@@ -150,6 +150,15 @@
             "author": "Eren Gölge @erogol",
             "license": "apache 2.0",
             "contact": "egolge@coqui.com"
+        },
+        "overflow": {
+            "description": "Overflow model trained on LJSpeech",
+            "github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
+            "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
+            "commit": "3b1a28f",
+            "author": "Eren Gölge @erogol",
+            "license": "apache 2.0",
+            "contact": "egolge@coqui.ai"
         }
     },
     "vctk": {
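For orientation, here is a small sketch (not part of the commit) of how the new entry can be looked up from the models manifest. The `TTS/.models.json` path and the top-level `tts_models` key are assumptions based on the usual layout of the file this hunk edits; the field values come from the hunk itself.

```python
import json

# Sketch: resolve the new "overflow" entry from the models manifest.
# The manifest path is an assumption; the field values come from the hunk above.
with open("TTS/.models.json", "r", encoding="utf-8") as f:
    manifest = json.load(f)

entry = manifest["tts_models"]["en"]["ljspeech"]["overflow"]
assert entry["default_vocoder"] == "vocoder_models/en/ljspeech/hifigan_v2"
print(entry["github_rls_url"])
```

With the entry in place, the model should be addressable as `tts_models/en/ljspeech/overflow`, e.g. via something like `tts --model_name tts_models/en/ljspeech/overflow --text "..." --out_path out.wav`.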
@@ -223,7 +232,7 @@
             "author": "@NeonGeckoCom",
             "license": "bsd-3-clause"
         }
     }
 },
 "fr": {
     "mai": {
@@ -129,7 +129,8 @@ def load_tts_samples(
                 meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers)
                 meta_data_eval = add_extra_keys(meta_data_eval, language, dataset_name)
             else:
-                meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_split_max_size, eval_split_size)
+                eval_size_per_dataset = eval_split_max_size // len(datasets) if eval_split_max_size else None
+                meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_size_per_dataset, eval_split_size)
             meta_data_eval_all += meta_data_eval
         meta_data_train_all += meta_data_train
     # load attention masks for the duration predictor training
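A minimal sketch of the behaviour this hunk changes: the global `eval_split_max_size` cap is now divided across all configured datasets instead of being applied per dataset. The helper name below is hypothetical; only the division rule comes from the diff.

```python
# Illustrative only: distribute a global eval cap over N datasets, as the hunk above does.
def eval_cap_per_dataset(eval_split_max_size, num_datasets):
    # None means "no cap"; otherwise each dataset gets an equal share (integer division).
    return eval_split_max_size // num_datasets if eval_split_max_size else None

# With 4 datasets and a global cap of 1000 eval samples, each dataset
# contributes at most 250 samples to the eval split.
assert eval_cap_per_dataset(1000, 4) == 250
assert eval_cap_per_dataset(None, 4) is None
```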
@@ -520,6 +520,7 @@ class TTSDataset(Dataset):
                 "raw_text": batch["raw_text"],
                 "pitch": pitch,
                 "language_ids": language_ids,
+                "audio_unique_names": batch["audio_unique_name"],
             }

         raise TypeError(
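A sketch of the batching pattern used here: each sample's `audio_unique_name` is gathered into a batch-level `audio_unique_names` list so the identifiers travel with the collated batch. The sample values below are placeholders, not real dataset entries.

```python
# Sketch: gather per-sample identifiers into a batch-level list, mirroring the diff above.
def collate_unique_names(samples):
    return {"audio_unique_names": [s["audio_unique_name"] for s in samples]}

batch = collate_unique_names([
    {"audio_unique_name": "ds1#clip_0001"},
    {"audio_unique_name": "ds1#clip_0002"},
])
print(batch["audio_unique_names"])  # ['ds1#clip_0001', 'ds1#clip_0002']
```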
@@ -311,7 +311,7 @@ class NeuralHMM(nn.Module):

         # If the length of the mel is less than the number of states it will select the -inf values leading to nan gradients
         # Ideally, we should clean the dataset otherwise this is a little hack uncomment the line below
-        # final_log_c = final_log_c.clamp(min=torch.finfo(final_log_c.dtype).min)
+        final_log_c = final_log_c.clamp(min=torch.finfo(final_log_c.dtype).min)

         sum_final_log_c = torch.logsumexp(final_log_c, dim=1)
         return sum_final_log_c
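A minimal repro of the failure mode described in the comments above, assuming PyTorch: when a whole row is -inf, `logsumexp` back-propagates NaNs, while clamping to the smallest finite value keeps the backward pass defined.

```python
import torch

# All -inf row: logsumexp's gradient is exp(-inf - (-inf)) = nan.
log_c = torch.full((1, 4), float("-inf"), requires_grad=True)
torch.logsumexp(log_c, dim=1).sum().backward()
print(log_c.grad)  # tensor of nan

# Clamping to the smallest finite value first keeps the gradient defined.
log_c2 = torch.full((1, 4), float("-inf"), requires_grad=True)
clamped = log_c2.clamp(min=torch.finfo(log_c2.dtype).min)
torch.logsumexp(clamped, dim=1).sum().backward()
print(log_c2.grad)  # finite (zeros), since clamp blocks the -inf path
```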
@@ -232,6 +232,7 @@ class BaseTTS(BaseTrainerModel):
             "waveform": waveform,
             "pitch": pitch,
             "language_ids": language_ids,
+            "audio_unique_names": batch["audio_unique_names"],
         }

     def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1):
@@ -388,6 +389,9 @@ class BaseTTS(BaseTrainerModel):
         test_sentences = self.config.test_sentences
         aux_inputs = self._get_test_aux_input()
         for idx, sen in enumerate(test_sentences):
+            if isinstance(sen, list):
+                aux_inputs = self.get_aux_input_from_test_sentences(sen)
+                sen = aux_inputs["text"]
             outputs_dict = synthesis(
                 self,
                 sen,
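A sketch of the control flow this hunk adds: plain-string test sentences keep the default aux inputs, while list entries carry their own text plus per-sentence metadata. The `[text, speaker_name]` list layout below is an assumption for illustration, not the project's exact schema.

```python
# Illustrative only: per-sentence aux inputs for test synthesis.
def resolve_test_sentence(sen, default_aux):
    aux = dict(default_aux)
    if isinstance(sen, list):  # list entries override the defaults, as in the diff above
        aux["speaker_name"] = sen[1]
        sen = sen[0]
    return sen, aux

default_aux = {"speaker_name": "p225"}
for sen in ["A plain string keeps the defaults.", ["A list entry brings its own speaker.", "p226"]]:
    text, aux = resolve_test_sentence(sen, default_aux)
    print(text, "->", aux["speaker_name"])
```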
@@ -366,7 +366,8 @@ class ModelManager(object):
         for file_path in z.namelist()[1:]:
             src_path = os.path.join(output_folder, file_path)
             dst_path = os.path.join(output_folder, os.path.basename(file_path))
-            copyfile(src_path, dst_path)
+            if src_path != dst_path:
+                copyfile(src_path, dst_path)
         # remove the extracted folder
         rmtree(os.path.join(output_folder, z.namelist()[0]))
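The guard added above skips the copy when an archive member is already at the top level of `output_folder`, in which case source and destination resolve to the same path and `shutil.copyfile` raises `SameFileError`. A standalone illustration:

```python
import os
import shutil
import tempfile

# shutil.copyfile refuses to copy a file onto itself; the diff's guard skips that case.
with tempfile.TemporaryDirectory() as output_folder:
    file_path = "model_file.pth"  # archive member already at the folder root
    src_path = os.path.join(output_folder, file_path)
    dst_path = os.path.join(output_folder, os.path.basename(file_path))
    open(src_path, "w").close()
    try:
        shutil.copyfile(src_path, dst_path)
    except shutil.SameFileError:
        print("same file, skip copy")  # the failure the new `if src_path != dst_path` check prevents
    if src_path != dst_path:  # guarded copy, as in the hunk above
        shutil.copyfile(src_path, dst_path)
```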