From 1b22f03e986134bcbcd2aba72fe8e226e07f5b9f Mon Sep 17 00:00:00 2001
From: WeberJulian <julian.weber@hotmail.fr>
Date: Wed, 30 Mar 2022 12:47:11 +0200
Subject: [PATCH] Fix G2P backend of the released models (#1461)

* Fix enforcement of the phonemizer check

* Add new models

* Fix .models.json
---
 TTS/.models.json         | 76 ++++++++++++++++++++--------------------
 TTS/utils/synthesizer.py |  7 ++--
 2 files changed, 41 insertions(+), 42 deletions(-)

diff --git a/TTS/.models.json b/TTS/.models.json
index 801b8468..24838a5d 100644
--- a/TTS/.models.json
+++ b/TTS/.models.json
@@ -4,7 +4,7 @@
             "multi-dataset":{
                 "your_tts":{
                     "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--multilingual--multi-dataset--your_tts.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
                     "default_vocoder": null,
                     "commit": "e9a1953e",
                     "license": "CC BY-NC-ND 4.0",
@@ -16,7 +16,7 @@
             "ek1": {
                 "tacotron2": {
                     "description": "EK1 en-rp tacotron2 by NMStoker",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.0/tts_models--en--ek1--tacotron2.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
                     "default_vocoder": "vocoder_models/en/ek1/wavegrad",
                     "commit": "c802255"
                 }
@@ -24,7 +24,7 @@
             "ljspeech": {
                 "tacotron2-DDC": {
                     "description": "Tacotron2 with Double Decoder Consistency.",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/tts_models--en--ljspeech--tacotron2-DDC.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
                     "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
                     "commit": "bae2ad0f",
                     "author": "Eren Gölge @erogol",
@@ -33,7 +33,7 @@
                 },
                 "tacotron2-DDC_ph": {
                     "description": "Tacotron2 with Double Decoder Consistency with phonemes.",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
                     "default_vocoder": "vocoder_models/en/ljspeech/univnet",
                     "commit": "3900448",
                     "author": "Eren Gölge @erogol",
@@ -42,7 +42,7 @@
                 },
                 "glow-tts": {
                     "description": "",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--en--ljspeech--glow-tts.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
                     "stats_file": null,
                     "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
                     "commit": "",
@@ -52,7 +52,7 @@
                 },
                 "speedy-speech": {
                     "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.3.0/tts_models--en--ljspeech--speedy_speech.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
                     "stats_file": null,
                     "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
                     "commit": "4581e3d",
@@ -62,7 +62,7 @@
                 },
                 "tacotron2-DCA": {
                     "description": "",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--en--ljspeech--tacotron2-DCA.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
                     "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
                     "commit": "",
                     "author": "Eren Gölge @erogol",
@@ -71,7 +71,7 @@
                 },
                 "vits": {
                     "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--vits.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
                     "default_vocoder": null,
                     "commit": "3900448",
                     "author": "Eren Gölge @erogol",
@@ -80,7 +80,7 @@
                 },
                 "fast_pitch": {
                     "description": "FastPitch model trained on LJSpeech using the Aligner Network",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.2/tts_models--en--ljspeech--fast_pitch.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
                     "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
                     "commit": "b27b3ba",
                     "author": "Eren Gölge @erogol",
@@ -91,7 +91,7 @@
             "vctk": {
                 "vits": {
                     "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--vits.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--vits.zip",
                     "default_vocoder": null,
                     "commit": "3900448",
                     "author": "Eren @erogol",
@@ -100,7 +100,7 @@
                 },
                 "fast_pitch":{
                     "description": "FastPitch model trained on VCTK dataseset.",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--fast_pitch.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
                     "default_vocoder": null,
                     "commit": "bdab788d",
                     "author": "Eren @erogol",
@@ -111,7 +111,7 @@
             "sam": {
                 "tacotron-DDC": {
                     "description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.13/tts_models--en--sam--tacotron_DDC.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
                     "default_vocoder": "vocoder_models/en/sam/hifigan_v2",
                     "commit": "bae2ad0f",
                     "author": "Eren Gölge @erogol",
@@ -123,7 +123,7 @@
         "es": {
             "mai": {
                 "tacotron2-DDC": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--es--mai--tacotron2-DDC.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
                     "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
                     "commit": "",
                     "author": "Eren Gölge @erogol",
@@ -135,7 +135,7 @@
         "fr": {
             "mai": {
                 "tacotron2-DDC": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/tts_models--fr--mai--tacotron2-DDC.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
                     "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
                     "commit": "",
                     "author": "Eren Gölge @erogol",
@@ -147,7 +147,7 @@
         "uk":{
             "mai": {
                 "glow-tts": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--uk--mai--glow-tts.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
                     "author":"@robinhad",
                     "commit": "bdab788d",
                     "license": "MIT",
@@ -159,7 +159,7 @@
         "zh-CN": {
             "baker": {
                 "tacotron2-DDC-GST": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
                     "commit": "unknown",
                     "author": "@kirianguiller",
                     "default_vocoder": null
@@ -169,7 +169,7 @@
         "nl": {
             "mai": {
                 "tacotron2-DDC": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/tts_models--nl--mai--tacotron2-DDC.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
                     "author": "@r-dh",
                     "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
                     "stats_file": null,
@@ -180,7 +180,7 @@
         "de": {
             "thorsten": {
                 "tacotron2-DCA": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.11/tts_models--de--thorsten--tacotron2-DCA.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
                     "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
                     "author": "@thorstenMueller",
                     "commit": "unknown"
@@ -190,7 +190,7 @@
         "ja": {
             "kokoro": {
                 "tacotron2-DDC": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.15/tts_models--jp--kokoro--tacotron2-DDC.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
                     "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
                     "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
                     "author": "@kaiidams",
@@ -201,7 +201,7 @@
         "tr":{
             "common-voice": {
                 "glow-tts":{
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--tr--common-voice--glow-tts.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
                     "default_vocoder": "vocoder_models/tr/common-voice/hifigan",
                     "license": "MIT",
                     "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
@@ -213,14 +213,14 @@
         "it": {
             "mai_female": {
                 "glow-tts":{
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_female--glow-tts.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
                     "default_vocoder": null,
                     "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
                     "author": "@nicolalandro",
                     "commit": null
                 },
                 "vits":{
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_female--vits.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_female--vits.zip",
                     "default_vocoder": null,
                     "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
                     "author": "@nicolalandro",
@@ -229,14 +229,14 @@
             },
             "mai_male": {
                 "glow-tts":{
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_male--glow-tts.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
                     "default_vocoder": null,
                     "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
                     "author": "@nicolalandro",
                     "commit": null
                 },
                 "vits":{
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--it--mai_male--vits.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/tts_models--it--mai_male--vits.zip",
                     "default_vocoder": null,
                     "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
                     "author": "@nicolalandro",
@@ -249,14 +249,14 @@
         "universal": {
             "libri-tts": {
                 "wavegrad": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--universal--libri-tts--wavegrad.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
                     "commit": "ea976b0",
                     "author": "Eren Gölge @erogol",
                     "license": "MPL",
                     "contact": "egolge@coqui.com"
                 },
                 "fullband-melgan": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--universal--libri-tts--fullband-melgan.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
                     "commit": "4132240",
                     "author": "Eren Gölge @erogol",
                     "license": "MPL",
@@ -268,13 +268,13 @@
             "ek1": {
                 "wavegrad": {
                     "description": "EK1 en-rp wavegrad by NMStoker",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/vocoder_models--en--ek1--wavegrad.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
                     "commit": "c802255"
                 }
             },
             "ljspeech": {
                 "multiband-melgan": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.9/vocoder_models--en--ljspeech--mulitband-melgan.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
                     "commit": "ea976b0",
                     "author": "Eren Gölge @erogol",
                     "license": "MPL",
@@ -282,7 +282,7 @@
                 },
                 "hifigan_v2": {
                     "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/vocoder_model--en--ljspeech-hifigan_v2.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
                     "commit": "bae2ad0f",
                     "author": "@erogol",
                     "license": "",
@@ -290,7 +290,7 @@
                 },
                 "univnet": {
                     "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.3.0/vocoder_models--en--ljspeech--univnet_v2.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
                     "commit": "4581e3d",
                     "author": "Eren @erogol",
                     "license": "TBD",
@@ -300,7 +300,7 @@
             "vctk": {
                 "hifigan_v2": {
                     "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.12/vocoder_model--en--vctk--hifigan_v2.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
                     "commit": "2f07160",
                     "author": "Edresson Casanova",
                     "license": "",
@@ -310,7 +310,7 @@
             "sam": {
                 "hifigan_v2": {
                     "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.13/vocoder_models--en--sam--hifigan_v2.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
                     "commit": "2f07160",
                     "author": "Eren Gölge @erogol",
                     "license": "",
@@ -321,7 +321,7 @@
         "nl": {
             "mai": {
                 "parallel-wavegan": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/vocoder_models--nl--mai--parallel-wavegan.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
                     "author": "@r-dh",
                     "commit": "unknown"
                 }
@@ -330,12 +330,12 @@
         "de": {
             "thorsten": {
                 "wavegrad": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.11/vocoder_models--de--thorsten--wavegrad.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
                     "author": "@thorstenMueller",
                     "commit": "unknown"
                 },
                 "fullband-melgan": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.3/vocoder_models--de--thorsten--fullband-melgan.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
                     "author": "@thorstenMueller",
                     "commit": "unknown"
                 }
@@ -344,7 +344,7 @@
         "ja": {
             "kokoro": {
                 "hifigan_v1": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/vocoder_models--ja--kokoro--hifigan_v1.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
                     "description": "HifiGAN model trained for kokoro dataset by @kaiidams",
                     "author": "@kaiidams",
                     "commit": "3900448"
@@ -354,7 +354,7 @@
         "uk": {
             "mai": {
                 "multiband-melgan": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.5.0_models/vocoder_models--uk--mai--multiband-melgan.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
                     "author":"@robinhad",
                     "commit": "bdab788d",
                     "license": "MIT",
@@ -365,7 +365,7 @@
         "tr":{
             "common-voice": {
                 "hifigan":{
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/vocoder_models--tr--common-voice--hifigan.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
                     "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
                     "author": "Fatih Akademi",
                     "license": "MIT",
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index 3dd8be44..eef4086c 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -109,12 +109,11 @@ class Synthesizer(object):
         """
         # pylint: disable=global-statement
         self.tts_config = load_config(tts_config_path)
-        self.use_phonemes = self.tts_config.use_phonemes
-        self.tts_model = setup_tts_model(config=self.tts_config)
-
-        if self.use_phonemes and self.tts_config["phonemizer"] is None:
+        if self.tts_config["use_phonemes"] and self.tts_config["phonemizer"] is None:
             raise ValueError("Phonemizer is not defined in the TTS config.")
 
+        self.tts_model = setup_tts_model(config=self.tts_config)
+
         if not self.encoder_checkpoint:
             self._set_speaker_encoder_paths_from_tts_config()