Adding pre-trained Overflow model (#2211)

* Adding pretrained Overflow model

* Stabilize HMM

* Fixup model manager

* Return `audio_unique_name` by default

* Distribute max split size over datasets

* Fixup eval_split_size

* Make style
Eren Gölge 2022-12-14 16:55:48 +01:00 committed by GitHub
parent 061ac43187
commit ecea43ec81
6 changed files with 21 additions and 5 deletions


@@ -141,7 +141,7 @@
"license": "bsd-3-clause",
"contact": null,
"commit": null
},
},
"fast_pitch": {
"description": "FastPitch model trained on LJSpeech using the Aligner Network",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
@@ -150,6 +150,15 @@
"author": "Eren Gölge @erogol",
"license": "apache 2.0",
"contact": "egolge@coqui.com"
},
"overflow": {
"description": "Overflow model trained on LJSpeech",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
"commit": "3b1a28f",
"author": "Eren Gölge @erogol",
"license": "apache 2.0",
"contact": "egolge@coqui.ai"
}
},
"vctk": {
@@ -223,7 +232,7 @@
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
}
},
"fr": {
"mai": {


@@ -129,7 +129,8 @@ def load_tts_samples(
meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers)
meta_data_eval = add_extra_keys(meta_data_eval, language, dataset_name)
else:
meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_split_max_size, eval_split_size)
eval_size_per_dataset = eval_split_max_size // len(datasets) if eval_split_max_size else None
meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_size_per_dataset, eval_split_size)
meta_data_eval_all += meta_data_eval
meta_data_train_all += meta_data_train
# load attention masks for the duration predictor training
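
The replaced line distributes the global `eval_split_max_size` budget evenly across all configured datasets instead of applying the full cap to each one. A small illustration of the arithmetic (dataset names and numbers are made up):

```python
# Illustration only: a global cap of 256 eval samples spread over 4 datasets
# means each dataset contributes at most 256 // 4 = 64 samples to the eval split.
eval_split_max_size = 256
datasets = ["dataset_a", "dataset_b", "dataset_c", "dataset_d"]  # placeholder names

eval_size_per_dataset = eval_split_max_size // len(datasets) if eval_split_max_size else None
print(eval_size_per_dataset)  # 64
```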


@@ -520,6 +520,7 @@ class TTSDataset(Dataset):
"raw_text": batch["raw_text"],
"pitch": pitch,
"language_ids": language_ids,
"audio_unique_names": batch["audio_unique_name"],
}
raise TypeError(


@@ -311,7 +311,7 @@ class NeuralHMM(nn.Module):
# If the length of the mel is less than the number of states it will select the -inf values leading to nan gradients
# Ideally, we should clean the dataset otherwise this is a little hack uncomment the line below
# final_log_c = final_log_c.clamp(min=torch.finfo(final_log_c.dtype).min)
final_log_c = final_log_c.clamp(min=torch.finfo(final_log_c.dtype).min)
sum_final_log_c = torch.logsumexp(final_log_c, dim=1)
return sum_final_log_c
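
A minimal sketch of the failure mode the now-enabled clamp guards against: when the mel spectrogram is shorter than the number of HMM states, every entry of `final_log_c` can end up at `-inf`, and `logsumexp` over an all-`-inf` row backpropagates NaN. The tensor shape below is arbitrary.

```python
# Sketch only: clamping to the smallest finite value keeps gradients finite.
import torch

final_log_c = torch.full((1, 4), float("-inf"), requires_grad=True)

# Without the clamp, logsumexp(final_log_c) is -inf and its backward pass is NaN.
clamped = final_log_c.clamp(min=torch.finfo(final_log_c.dtype).min)
torch.logsumexp(clamped, dim=1).sum().backward()
print(final_log_c.grad)  # tensor([[0., 0., 0., 0.]]) -- finite instead of NaN
```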


@@ -232,6 +232,7 @@ class BaseTTS(BaseTrainerModel):
"waveform": waveform,
"pitch": pitch,
"language_ids": language_ids,
"audio_unique_names": batch["audio_unique_names"],
}
def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1):
@@ -388,6 +389,9 @@
test_sentences = self.config.test_sentences
aux_inputs = self._get_test_aux_input()
for idx, sen in enumerate(test_sentences):
if isinstance(sen, list):
aux_inputs = self.get_aux_input_from_test_sentences(sen)
sen = aux_inputs["text"]
outputs_dict = synthesis(
self,
sen,
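
Hedged illustration of the intent: entries in `config.test_sentences` may now be plain strings or lists that bundle the text with auxiliary inputs. The exact list layout is model specific and resolved by `get_aux_input_from_test_sentences`; the speaker field below is hypothetical.

```python
# Sketch only: mixing plain strings and list-style test sentences.
test_sentences = [
    "This is a plain test sentence.",
    ["This sentence carries extra fields.", "speaker_0"],  # hypothetical layout
]
for sen in test_sentences:
    if isinstance(sen, list):
        sen = sen[0]  # stand-in for aux_inputs["text"] extracted by the model
    print(sen)
```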


@@ -366,7 +366,8 @@ class ModelManager(object):
for file_path in z.namelist()[1:]:
src_path = os.path.join(output_folder, file_path)
dst_path = os.path.join(output_folder, os.path.basename(file_path))
copyfile(src_path, dst_path)
if src_path != dst_path:
copyfile(src_path, dst_path)
# remove the extracted folder
rmtree(os.path.join(output_folder, z.namelist()[0]))
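
A plausible failure the new guard avoids, sketched below: `shutil.copyfile` raises `SameFileError` when source and destination resolve to the same file, which can happen when an extracted archive member already sits directly in `output_folder`.

```python
# Sketch only: copying a file onto itself raises SameFileError, hence the
# src_path != dst_path check before copyfile().
import os
import shutil
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "model_file.pth")
    open(path, "w").close()  # stand-in for an extracted archive member
    try:
        shutil.copyfile(path, path)
    except shutil.SameFileError as err:
        print("skipped copy:", err)
```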