Adding pre-trained Overflow model (#2211)

* Adding pretrained Overflow model

* Stabilize HMM

* Fixup model manager

* Return `audio_unique_name` by default

* Distribute max split size over datasets

* Fixup eval_split_size

* Make style
Eren Gölge 2022-12-14 16:55:48 +01:00 committed by GitHub
parent 061ac43187
commit ecea43ec81
6 changed files with 21 additions and 5 deletions


@@ -141,7 +141,7 @@
"license": "bsd-3-clause",
"contact": null,
"commit": null
},
},
"fast_pitch": {
"description": "FastPitch model trained on LJSpeech using the Aligner Network",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
@@ -150,6 +150,15 @@
"author": "Eren Gölge @erogol",
"license": "apache 2.0",
"contact": "egolge@coqui.com"
},
"overflow": {
"description": "Overflow model trained on LJSpeech",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
"default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
"commit": "3b1a28f",
"author": "Eren Gölge @erogol",
"license": "apache 2.0",
"contact": "egolge@coqui.ai"
}
},
"vctk": {
@@ -223,7 +232,7 @@
"author": "@NeonGeckoCom",
"license": "bsd-3-clause"
}
}
}
},
"fr": {
"mai": {


@@ -129,7 +129,8 @@ def load_tts_samples(
meta_data_eval = formatter(root_path, meta_file_val, ignored_speakers=ignored_speakers)
meta_data_eval = add_extra_keys(meta_data_eval, language, dataset_name)
else:
meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_split_max_size, eval_split_size)
eval_size_per_dataset = eval_split_max_size // len(datasets) if eval_split_max_size else None
meta_data_eval, meta_data_train = split_dataset(meta_data_train, eval_size_per_dataset, eval_split_size)
meta_data_eval_all += meta_data_eval
meta_data_train_all += meta_data_train
# load attention masks for the duration predictor training
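
The replaced line distributes the global `eval_split_max_size` budget evenly across all configured datasets instead of applying the full cap to each one. A small illustration of the arithmetic (dataset names and numbers are made up):

```python
# Illustration only: a global cap of 256 eval samples spread over 4 datasets
# means each dataset contributes at most 256 // 4 = 64 samples to the eval split.
eval_split_max_size = 256
datasets = ["dataset_a", "dataset_b", "dataset_c", "dataset_d"]  # placeholder names

eval_size_per_dataset = eval_split_max_size // len(datasets) if eval_split_max_size else None
print(eval_size_per_dataset)  # 64
```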


@@ -520,6 +520,7 @@ class TTSDataset(Dataset):
"raw_text": batch["raw_text"],
"pitch": pitch,
"language_ids": language_ids,
"audio_unique_names": batch["audio_unique_name"],
}
raise TypeError(


@@ -311,7 +311,7 @@ class NeuralHMM(nn.Module):
# If the length of the mel is less than the number of states it will select the -inf values leading to nan gradients
# Ideally, we should clean the dataset otherwise this is a little hack uncomment the line below
# final_log_c = final_log_c.clamp(min=torch.finfo(final_log_c.dtype).min)
final_log_c = final_log_c.clamp(min=torch.finfo(final_log_c.dtype).min)
sum_final_log_c = torch.logsumexp(final_log_c, dim=1)
return sum_final_log_c
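
A minimal sketch of the failure mode the now-enabled clamp guards against: when the mel spectrogram is shorter than the number of HMM states, every entry of `final_log_c` can end up at `-inf`, and `logsumexp` over an all-`-inf` row backpropagates NaN. The tensor shape below is arbitrary.

```python
# Sketch only: clamping to the smallest finite value keeps gradients finite.
import torch

final_log_c = torch.full((1, 4), float("-inf"), requires_grad=True)

# Without the clamp, logsumexp(final_log_c) is -inf and its backward pass is NaN.
clamped = final_log_c.clamp(min=torch.finfo(final_log_c.dtype).min)
torch.logsumexp(clamped, dim=1).sum().backward()
print(final_log_c.grad)  # tensor([[0., 0., 0., 0.]]) -- finite instead of NaN
```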


@@ -232,6 +232,7 @@ class BaseTTS(BaseTrainerModel):
"waveform": waveform,
"pitch": pitch,
"language_ids": language_ids,
"audio_unique_names": batch["audio_unique_names"],
}
def get_sampler(self, config: Coqpit, dataset: TTSDataset, num_gpus=1):
@@ -388,6 +389,9 @@
test_sentences = self.config.test_sentences
aux_inputs = self._get_test_aux_input()
for idx, sen in enumerate(test_sentences):
if isinstance(sen, list):
aux_inputs = self.get_aux_input_from_test_sentences(sen)
sen = aux_inputs["text"]
outputs_dict = synthesis(
self,
sen,
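
Hedged illustration of the intent: entries in `config.test_sentences` may now be plain strings or lists that bundle the text with auxiliary inputs. The exact list layout is model specific and resolved by `get_aux_input_from_test_sentences`; the speaker field below is hypothetical.

```python
# Sketch only: mixing plain strings and list-style test sentences.
test_sentences = [
    "This is a plain test sentence.",
    ["This sentence carries extra fields.", "speaker_0"],  # hypothetical layout
]
for sen in test_sentences:
    if isinstance(sen, list):
        sen = sen[0]  # stand-in for aux_inputs["text"] extracted by the model
    print(sen)
```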


@@ -366,7 +366,8 @@ class ModelManager(object):
for file_path in z.namelist()[1:]:
src_path = os.path.join(output_folder, file_path)
dst_path = os.path.join(output_folder, os.path.basename(file_path))
copyfile(src_path, dst_path)
if src_path != dst_path:
copyfile(src_path, dst_path)
# remove the extracted folder
rmtree(os.path.join(output_folder, z.namelist()[0]))
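
A plausible failure the new guard avoids, sketched below: `shutil.copyfile` raises `SameFileError` when source and destination resolve to the same file, which can happen when an extracted archive member already sits directly in `output_folder`.

```python
# Sketch only: copying a file onto itself raises SameFileError, hence the
# src_path != dst_path check before copyfile().
import os
import shutil
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, "model_file.pth")
    open(path, "w").close()  # stand-in for an extracted archive member
    try:
        shutil.copyfile(path, path)
    except shutil.SameFileError as err:
        print("skipped copy:", err)
```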