From 59576fc0ecf7c86681741bf440a699f85c11bdc5 Mon Sep 17 00:00:00 2001
From: Edresson Casanova <edresson1@gmail.com>
Date: Fri, 20 Oct 2023 17:29:43 -0300
Subject: [PATCH 1/5] Bug fix on XTTS v1.1 inference (#3093)

* Bug fix on XTTS v1.1 inference

* Update .models.json

---------

Co-authored-by: Julian Weber <julian.weber@hotmail.fr>
---
 TTS/.models.json               | 10 +++++-----
 TTS/tts/configs/xtts_config.py |  6 +++---
 TTS/tts/models/xtts.py         | 11 ++++++++---
 3 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/TTS/.models.json b/TTS/.models.json
index 8e35893b..0c318740 100644
--- a/TTS/.models.json
+++ b/TTS/.models.json
@@ -18,12 +18,12 @@
                 "xtts_v1.1": {
                     "description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.",
                     "hf_url": [
-                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1/model.pth",
-                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1/config.json",
-                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1/vocab.json",
-                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1/hash.md5"
+                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/model.pth",
+                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/config.json",
+                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/vocab.json",
+                        "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/hash.md5"
                     ],
-                    "model_hash": "10163afc541dc86801b33d1f3217b456",
+                    "model_hash": "ae9e4b39e095fd5728fe7f7931ec66ad",
                     "default_vocoder": null,
                     "commit": "82910a63",
                     "license": "CPML",
diff --git a/TTS/tts/configs/xtts_config.py b/TTS/tts/configs/xtts_config.py
index b9685590..4e5031ba 100644
--- a/TTS/tts/configs/xtts_config.py
+++ b/TTS/tts/configs/xtts_config.py
@@ -78,13 +78,13 @@ class XttsConfig(BaseTTSConfig):
     )
 
     # inference params
-    temperature: float = 0.2
+    temperature: float = 0.85
     length_penalty: float = 1.0
     repetition_penalty: float = 2.0
     top_k: int = 50
-    top_p: float = 0.8
+    top_p: float = 0.85
     cond_free_k: float = 2.0
     diffusion_temperature: float = 1.0
-    num_gpt_outputs: int = 16
+    num_gpt_outputs: int = 1
     decoder_iterations: int = 30
     decoder_sampler: str = "ddim"
diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py
index 76c5595e..40e8f946 100644
--- a/TTS/tts/models/xtts.py
+++ b/TTS/tts/models/xtts.py
@@ -821,8 +821,6 @@ class Xtts(BaseTTS):
             self.tokenizer = VoiceBpeTokenizer(vocab_file=vocab_path)
 
         self.init_models()
-        if eval:
-            self.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache)
 
         checkpoint = load_fsspec(model_path, map_location=torch.device("cpu"))["model"]
         ignore_keys = ["diffusion_decoder", "vocoder"] if self.args.use_hifigan or self.args.use_ne_hifigan else []
@@ -831,7 +829,14 @@ class Xtts(BaseTTS):
         for key in list(checkpoint.keys()):
             if key.split(".")[0] in ignore_keys:
                 del checkpoint[key]
-        self.load_state_dict(checkpoint, strict=strict)
+
+        # deal with v1 and v1.1. V1 has the init_gpt_for_inference keys, v1.1 does not
+        try:
+            self.load_state_dict(checkpoint, strict=strict)
+        except:
+            if eval:
+                self.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache)
+            self.load_state_dict(checkpoint, strict=strict)
 
         if eval:
             if hasattr(self, "hifigan_decoder"): self.hifigan_decoder.eval()

From 414f0de0a1e89d871ccbec4a245be83c2afe883f Mon Sep 17 00:00:00 2001
From: Edresson Casanova <edresson1@gmail.com>
Date: Fri, 20 Oct 2023 17:30:58 -0300
Subject: [PATCH 2/5] Bump up to v0.18.1

---
 TTS/VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/VERSION b/TTS/VERSION
index 66333910..249afd51 100644
--- a/TTS/VERSION
+++ b/TTS/VERSION
@@ -1 +1 @@
-0.18.0
+0.18.1

From c7a16042e3c4004862a74dc599b4278ee85a2e7e Mon Sep 17 00:00:00 2001
From: Julian Weber <julian.weber@hotmail.fr>
Date: Sat, 21 Oct 2023 11:18:58 +0200
Subject: [PATCH 3/5] Remove global cutlet import

---
 TTS/tts/layers/xtts/tokenizer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py
index f34a7ac0..1d4ed235 100644
--- a/TTS/tts/layers/xtts/tokenizer.py
+++ b/TTS/tts/layers/xtts/tokenizer.py
@@ -6,7 +6,6 @@ import torch
 from tokenizers import Tokenizer
 
 import pypinyin
-import cutlet
 from num2words import num2words
 from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
 

From dad6a7b0b6bba9cf5cc0c3c72c7b29e0905609db Mon Sep 17 00:00:00 2001
From: Julian Weber <julian.weber@hotmail.fr>
Date: Sat, 21 Oct 2023 11:26:03 +0200
Subject: [PATCH 4/5] Preserve [ja] token of the text processing

---
 TTS/tts/layers/xtts/tokenizer.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py
index 1d4ed235..4b9fb9ed 100644
--- a/TTS/tts/layers/xtts/tokenizer.py
+++ b/TTS/tts/layers/xtts/tokenizer.py
@@ -483,10 +483,13 @@ class VoiceBpeTokenizer:
             if lang == "zh-cn":
                 txt = chinese_transliterate(txt)
         elif lang == "ja":
+            assert txt[:4] == "[ja]", "Japanese speech should start with the [ja] token."
+            txt = txt[4:]
             if self.katsu is None:
                 import cutlet
                 self.katsu = cutlet.Cutlet()
             txt = japanese_cleaners(txt, self.katsu)
+            txt = "[ja]" + txt
         else:
             raise NotImplementedError()
         return txt

From 1e152692ed4662d0d17c65aba6d0e0c5d1cbfa44 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <erogol@hotmail.com>
Date: Sat, 21 Oct 2023 17:29:53 +0200
Subject: [PATCH 5/5] Bump up to v0.18.2

---
 TTS/VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/TTS/VERSION b/TTS/VERSION
index 249afd51..503a21de 100644
--- a/TTS/VERSION
+++ b/TTS/VERSION
@@ -1 +1 @@
-0.18.1
+0.18.2