From 1b22f03e986134bcbcd2aba72fe8e226e07f5b9f Mon Sep 17 00:00:00 2001 From: WeberJulian Date: Wed, 30 Mar 2022 12:47:11 +0200 Subject: [PATCH] Fix G2P backend of the released models (#1461) * Fix enforce phonemizer * Add new models * Fix .model.json --- TTS/.models.json | 76 ++++++++++++++++++++-------------------- TTS/utils/synthesizer.py | 7 ++-- 2 files changed, 41 insertions(+), 42 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index 801b8468..24838a5d 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -4,7 +4,7 @@ "multi-dataset":{ "your_tts":{ "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--multilingual--multi-dataset--your_tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip", "default_vocoder": null, "commit": "e9a1953e", "license": "CC BY-NC-ND 4.0", @@ -16,7 +16,7 @@ "ek1": { "tacotron2": { "description": "EK1 en-rp tacotron2 by NMStoker", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.0/tts_models--en--ek1--tacotron2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip", "default_vocoder": "vocoder_models/en/ek1/wavegrad", "commit": "c802255" } @@ -24,7 +24,7 @@ "ljspeech": { "tacotron2-DDC": { "description": "Tacotron2 with Double Decoder Consistency.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/tts_models--en--ljspeech--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip", "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "bae2ad0f", "author": "Eren Gölge @erogol", @@ -33,7 +33,7 @@ }, "tacotron2-DDC_ph": { "description": "Tacotron2 with Double Decoder Consistency with phonemes.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip", "default_vocoder": "vocoder_models/en/ljspeech/univnet", "commit": "3900448", "author": "Eren Gölge @erogol", @@ -42,7 +42,7 @@ }, "glow-tts": { "description": "", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--en--ljspeech--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip", "stats_file": null, "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", "commit": "", @@ -52,7 +52,7 @@ }, "speedy-speech": { "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.3.0/tts_models--en--ljspeech--speedy_speech.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip", "stats_file": null, "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "4581e3d", @@ -62,7 +62,7 @@ }, "tacotron2-DCA": { "description": "", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--en--ljspeech--tacotron2-DCA.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip", "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan", "commit": "", "author": "Eren Gölge @erogol", @@ -71,7 +71,7 @@ }, "vits": { "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--vits.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip", "default_vocoder": null, "commit": "3900448", "author": "Eren Gölge @erogol", @@ -80,7 +80,7 @@ }, "fast_pitch": { "description": "FastPitch model trained on LJSpeech using the Aligner Network", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.2/tts_models--en--ljspeech--fast_pitch.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip", "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2", "commit": "b27b3ba", "author": "Eren Gölge @erogol", @@ -91,7 +91,7 @@ "vctk": { "vits": { "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--vits.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip", "default_vocoder": null, "commit": "3900448", "author": "Eren @erogol", @@ -100,7 +100,7 @@ }, "fast_pitch":{ "description": "FastPitch model trained on VCTK dataseset.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--fast_pitch.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip", "default_vocoder": null, "commit": "bdab788d", "author": "Eren @erogol", @@ -111,7 +111,7 @@ "sam": { "tacotron-DDC": { "description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.13/tts_models--en--sam--tacotron_DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip", "default_vocoder": "vocoder_models/en/sam/hifigan_v2", "commit": "bae2ad0f", "author": "Eren Gölge @erogol", @@ -123,7 +123,7 @@ "es": { "mai": { "tacotron2-DDC": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--es--mai--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip", "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", "commit": "", "author": "Eren Gölge @erogol", @@ -135,7 +135,7 @@ "fr": { "mai": { "tacotron2-DDC": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--fr--mai--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip", "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan", "commit": "", "author": "Eren Gölge @erogol", @@ -147,7 +147,7 @@ "uk":{ "mai": { "glow-tts": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--uk--mai--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip", "author":"@robinhad", "commit": "bdab788d", "license": "MIT", @@ -159,7 +159,7 @@ "zh-CN": { "baker": { "tacotron2-DDC-GST": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip", "commit": "unknown", "author": "@kirianguiller", "default_vocoder": null @@ -169,7 +169,7 @@ "nl": { "mai": { "tacotron2-DDC": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/tts_models--nl--mai--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip", "author": "@r-dh", "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan", "stats_file": null, @@ -180,7 +180,7 @@ "de": { "thorsten": { "tacotron2-DCA": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.11/tts_models--de--thorsten--tacotron2-DCA.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip", "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan", "author": "@thorstenMueller", "commit": "unknown" @@ -190,7 +190,7 @@ "ja": { "kokoro": { "tacotron2-DDC": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip", "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1", "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.", "author": "@kaiidams", @@ -201,7 +201,7 @@ "tr":{ "common-voice": { "glow-tts":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--tr--common-voice--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip", "default_vocoder": "vocoder_models/tr/common-voice/hifigan", "license": "MIT", "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.", @@ -213,14 +213,14 @@ "it": { "mai_female": { "glow-tts":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_female--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", "commit": null }, "vits":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_female--vits.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", @@ -229,14 +229,14 @@ }, "mai_male": { "glow-tts":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_male--glow-tts.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", "commit": null }, "vits":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_male--vits.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip", "default_vocoder": null, "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.", "author": "@nicolalandro", @@ -249,14 +249,14 @@ "universal": { "libri-tts": { "wavegrad": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--universal--libri-tts--wavegrad.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip", "commit": "ea976b0", "author": "Eren Gölge @erogol", "license": "MPL", "contact": "egolge@coqui.com" }, "fullband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--universal--libri-tts--fullband-melgan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip", "commit": "4132240", "author": "Eren Gölge @erogol", "license": "MPL", @@ -268,13 +268,13 @@ "ek1": { "wavegrad": { "description": "EK1 en-rp wavegrad by NMStoker", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/vocoder_models--en--ek1--wavegrad.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip", "commit": "c802255" } }, "ljspeech": { "multiband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--en--ljspeech--mulitband-melgan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip", "commit": "ea976b0", "author": "Eren Gölge @erogol", "license": "MPL", @@ -282,7 +282,7 @@ }, "hifigan_v2": { "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/vocoder_model--en--ljspeech-hifigan_v2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip", "commit": "bae2ad0f", "author": "@erogol", "license": "", @@ -290,7 +290,7 @@ }, "univnet": { "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.3.0/vocoder_models--en--ljspeech--univnet_v2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip", "commit": "4581e3d", "author": "Eren @erogol", "license": "TBD", @@ -300,7 +300,7 @@ "vctk": { "hifigan_v2": { "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/vocoder_model--en--vctk--hifigan_v2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip", "commit": "2f07160", "author": "Edresson Casanova", "license": "", @@ -310,7 +310,7 @@ "sam": { "hifigan_v2": { "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC", - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.13/vocoder_models--en--sam--hifigan_v2.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip", "commit": "2f07160", "author": "Eren Gölge @erogol", "license": "", @@ -321,7 +321,7 @@ "nl": { "mai": { "parallel-wavegan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/vocoder_models--nl--mai--parallel-wavegan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip", "author": "@r-dh", "commit": "unknown" } @@ -330,12 +330,12 @@ "de": { "thorsten": { "wavegrad": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip", "author": "@thorstenMueller", "commit": "unknown" }, "fullband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.3/vocoder_models--de--thorsten--fullband-melgan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip", "author": "@thorstenMueller", "commit": "unknown" } @@ -344,7 +344,7 @@ "ja": { "kokoro": { "hifigan_v1": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/vocoder_models--ja--kokoro--hifigan_v1.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip", "description": "HifiGAN model trained for kokoro dataset by @kaiidams", "author": "@kaiidams", "commit": "3900448" @@ -354,7 +354,7 @@ "uk": { "mai": { "multiband-melgan": { - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.5.0_models/vocoder_models--uk--mai--multiband-melgan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip", "author":"@robinhad", "commit": "bdab788d", "license": "MIT", @@ -365,7 +365,7 @@ "tr":{ "common-voice": { "hifigan":{ - "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/vocoder_models--tr--common-voice--hifigan.zip", + "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip", "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.", "author": "Fatih Akademi", "license": "MIT", diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index 3dd8be44..eef4086c 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -109,12 +109,11 @@ class Synthesizer(object): """ # pylint: disable=global-statement self.tts_config = load_config(tts_config_path) - self.use_phonemes = self.tts_config.use_phonemes - self.tts_model = setup_tts_model(config=self.tts_config) - - if self.use_phonemes and self.tts_config["phonemizer"] is None: + if self.tts_config["use_phonemes"] and self.tts_config["phonemizer"] is None: raise ValueError("Phonemizer is not defined in the TTS config.") + self.tts_model = setup_tts_model(config=self.tts_config) + if not self.encoder_checkpoint: self._set_speaker_encoder_paths_from_tts_config()