From dd4287de1fce944c77cef7498e0daf1a2154abfc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 3 Mar 2022 20:23:00 +0100
Subject: [PATCH] Update models

---
 TTS/.models.json                | 21 ++++++---------------
 TTS/tts/models/vits.py          |  4 ++--
 TTS/tts/utils/synthesis.py      |  2 +-
 TTS/tts/utils/text/tokenizer.py |  3 +++
 4 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/TTS/.models.json b/TTS/.models.json
index 2e6c0ebf..366358be 100644
--- a/TTS/.models.json
+++ b/TTS/.models.json
@@ -33,7 +33,7 @@
             },
             "tacotron2-DDC_ph": {
                 "description": "Tacotron2 with Double Decoder Consistency with phonemes.",
-                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/tts_models--en--ljspeech--tacotronDDC_ph.zip",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
                 "default_vocoder": "vocoder_models/en/ljspeech/univnet",
                 "commit": "3900448",
                 "author": "Eren Gölge @erogol",
@@ -71,7 +71,7 @@
             },
             "vits": {
                 "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
-                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/tts_models--en--ljspeech--vits.zip",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--vits.zip",
                 "default_vocoder": null,
                 "commit": "3900448",
                 "author": "Eren Gölge @erogol",
@@ -89,18 +89,9 @@
             }
         },
         "vctk": {
-            "sc-glow-tts": {
-                "description": "Multi-Speaker Transformers based SC-Glow model from https://arxiv.org/abs/2104.05557.",
-                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.0/tts_models--en--vctk--sc-glow-tts.zip",
-                "default_vocoder": "vocoder_models/en/vctk/hifigan_v2",
-                "commit": "b531fa69",
-                "author": "Edresson Casanova",
-                "license": "",
-                "contact": ""
-            },
             "vits": {
                 "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
-                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/tts_models--en--vctk--vits.zip",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--vits.zip",
                 "default_vocoder": null,
                 "commit": "3900448",
                 "author": "Eren @erogol",
@@ -109,7 +100,7 @@
             },
             "fast_pitch":{
                 "description": "FastPitch model trained on VCTK dataseset.",
-                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.4.0/tts_models--en--vctk--fast_pitch.zip",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--fast_pitch.zip",
                 "default_vocoder": null,
                 "commit": "bdab788d",
                 "author": "Eren @erogol",
@@ -156,7 +147,7 @@
         "uk":{
             "mai": {
                 "glow-tts": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.4.0/tts_models--uk--mailabs--glow-tts.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--uk--mai--glow-tts.zip",
                     "author":"@robinhad",
                     "commit": "bdab788d",
                     "license": "MIT",
@@ -168,7 +159,7 @@
     "zh-CN": {
         "baker": {
             "tacotron2-DDC-GST": {
-                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
                 "commit": "unknown",
                 "author": "@kirianguiller",
                 "default_vocoder": null
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index 1ad8807f..a43e081c 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -1470,7 +1470,7 @@ class Vits(BaseTTS):
         """
         from TTS.utils.audio import AudioProcessor

-        upsample_rate = math.prod(config.model_args.upsample_rates_decoder)
+        upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item()
         assert (
             upsample_rate == config.audio.hop_length
         ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}"
@@ -1480,7 +1480,7 @@ class Vits(BaseTTS):
         speaker_manager = SpeakerManager.init_from_config(config, samples)
         language_manager = LanguageManager.init_from_config(config)

-        if config.model_args.speaker_encoder_model_path is not None:
+        if config.model_args.speaker_encoder_model_path:
             speaker_manager.init_speaker_encoder(
                 config.model_args.speaker_encoder_model_path, config.model_args.speaker_encoder_config_path
             )
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index 377f32de..4ec84a3d 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -167,7 +167,7 @@ def synthesis(
         style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda)
     # convert text to sequence of token IDs
     text_inputs = np.asarray(
-        model.tokenizer.text_to_ids(text),
+        model.tokenizer.text_to_ids(text, language=language_id),
         dtype=np.int32,
     )
     # pass tensors to backend
diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py
index 50a5f519..f0d85a44 100644
--- a/TTS/tts/utils/text/tokenizer.py
+++ b/TTS/tts/utils/text/tokenizer.py
@@ -93,6 +93,9 @@ class TTSTokenizer:
             language(str):
                 The language code of the text. Defaults to None.

+        TODO:
+            - Add support for language-specific processing.
+
         1. Text normalizatin
         2. Phonemization (if use_phonemes is True)
         3. Add blank char between characters
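Note on the TTS/tts/models/vits.py hunk above: the patched line computes the
decoder's total upsampling factor with torch instead of math.prod (which
requires Python 3.8), and the assert that follows requires this factor to
equal the audio processor's hop length, so that each input spectrogram frame
expands to exactly one hop of waveform samples. A minimal standalone sketch of
that invariant, using illustrative values rather than any shipped VitsConfig:

    import torch

    # Hypothetical config values for illustration only.
    upsample_rates_decoder = [8, 8, 2, 2]  # per-stage decoder upsampling factors
    hop_length = 256                       # STFT hop size of the audio processor

    # Same computation as the patched line: product of the per-stage rates,
    # with .item() extracting a plain Python number from the 0-dim tensor.
    upsample_rate = torch.prod(torch.as_tensor(upsample_rates_decoder)).item()

    # If this fails, generated audio length would drift from the frame count.
    assert upsample_rate == hop_length, f"{upsample_rate} vs {hop_length}"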
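Note on the speaker-encoder hunk: replacing `is not None` with a plain
truthiness test also skips encoder initialization when the configured path is
an empty string, a common placeholder default. A small sketch of the behavior
difference, with a hypothetical path value:

    # Hypothetical stand-in for config.model_args.speaker_encoder_model_path.
    speaker_encoder_model_path = ""

    # The old check (`is not None`) would enter the branch here and try to
    # load an encoder from ""; the new truthiness check does not.
    if speaker_encoder_model_path:
        print("initializing speaker encoder")
    else:
        print("no speaker encoder configured")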