From 59576fc0ecf7c86681741bf440a699f85c11bdc5 Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Fri, 20 Oct 2023 17:29:43 -0300 Subject: [PATCH 1/5] Bug fix on XTTS v1.1 inference (#3093) * Bug fix on XTTS v1.1 inference * Update .models.json --------- Co-authored-by: Julian Weber --- TTS/.models.json | 10 +++++----- TTS/tts/configs/xtts_config.py | 6 +++--- TTS/tts/models/xtts.py | 11 ++++++++--- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/TTS/.models.json b/TTS/.models.json index 8e35893b..0c318740 100644 --- a/TTS/.models.json +++ b/TTS/.models.json @@ -18,12 +18,12 @@ "xtts_v1.1": { "description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.", "hf_url": [ - "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1/model.pth", - "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1/config.json", - "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1/vocab.json", - "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1/hash.md5" + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/model.pth", + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/config.json", + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/vocab.json", + "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/hash.md5" ], - "model_hash": "10163afc541dc86801b33d1f3217b456", + "model_hash": "ae9e4b39e095fd5728fe7f7931ec66ad", "default_vocoder": null, "commit": "82910a63", "license": "CPML", diff --git a/TTS/tts/configs/xtts_config.py b/TTS/tts/configs/xtts_config.py index b9685590..4e5031ba 100644 --- a/TTS/tts/configs/xtts_config.py +++ b/TTS/tts/configs/xtts_config.py @@ -78,13 +78,13 @@ class XttsConfig(BaseTTSConfig): ) # inference params - temperature: float = 0.2 + temperature: float = 0.85 length_penalty: float = 1.0 repetition_penalty: float = 2.0 top_k: int = 50 - top_p: float = 0.8 + top_p: float = 0.85 cond_free_k: float = 2.0 diffusion_temperature: float = 1.0 - num_gpt_outputs: int = 16 + 
num_gpt_outputs: int = 1 decoder_iterations: int = 30 decoder_sampler: str = "ddim" diff --git a/TTS/tts/models/xtts.py b/TTS/tts/models/xtts.py index 76c5595e..40e8f946 100644 --- a/TTS/tts/models/xtts.py +++ b/TTS/tts/models/xtts.py @@ -821,8 +821,6 @@ class Xtts(BaseTTS): self.tokenizer = VoiceBpeTokenizer(vocab_file=vocab_path) self.init_models() - if eval: - self.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache) checkpoint = load_fsspec(model_path, map_location=torch.device("cpu"))["model"] ignore_keys = ["diffusion_decoder", "vocoder"] if self.args.use_hifigan or self.args.use_ne_hifigan else [] @@ -831,7 +829,14 @@ class Xtts(BaseTTS): for key in list(checkpoint.keys()): if key.split(".")[0] in ignore_keys: del checkpoint[key] - self.load_state_dict(checkpoint, strict=strict) + + # deal with v1 and v1.1. V1 has the init_gpt_for_inference keys, v1.1 does not + try: + self.load_state_dict(checkpoint, strict=strict) + except: + if eval: + self.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache) + self.load_state_dict(checkpoint, strict=strict) if eval: if hasattr(self, "hifigan_decoder"): self.hifigan_decoder.eval() From 414f0de0a1e89d871ccbec4a245be83c2afe883f Mon Sep 17 00:00:00 2001 From: Edresson Casanova Date: Fri, 20 Oct 2023 17:30:58 -0300 Subject: [PATCH 2/5] Bump up to v0.18.1 --- TTS/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/VERSION b/TTS/VERSION index 66333910..249afd51 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.18.0 +0.18.1 From c7a16042e3c4004862a74dc599b4278ee85a2e7e Mon Sep 17 00:00:00 2001 From: Julian Weber Date: Sat, 21 Oct 2023 11:18:58 +0200 Subject: [PATCH 3/5] Remove global cutlet import --- TTS/tts/layers/xtts/tokenizer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index f34a7ac0..1d4ed235 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -6,7 +6,6 @@ import torch from 
tokenizers import Tokenizer import pypinyin -import cutlet from num2words import num2words from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words From dad6a7b0b6bba9cf5cc0c3c72c7b29e0905609db Mon Sep 17 00:00:00 2001 From: Julian Weber Date: Sat, 21 Oct 2023 11:26:03 +0200 Subject: [PATCH 4/5] Preserve [ja] token of the text processing --- TTS/tts/layers/xtts/tokenizer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index 1d4ed235..4b9fb9ed 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -483,10 +483,13 @@ class VoiceBpeTokenizer: if lang == "zh-cn": txt = chinese_transliterate(txt) elif lang == "ja": + assert txt[:4] == "[ja]", "Japanese speech should start with the [ja] token." + txt = txt[4:] if self.katsu is None: import cutlet self.katsu = cutlet.Cutlet() txt = japanese_cleaners(txt, self.katsu) + txt = "[ja]" + txt else: raise NotImplementedError() return txt From 1e152692ed4662d0d17c65aba6d0e0c5d1cbfa44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Sat, 21 Oct 2023 17:29:53 +0200 Subject: [PATCH 5/5] Bump up to v0.18.2 --- TTS/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/VERSION b/TTS/VERSION index 249afd51..503a21de 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.18.1 +0.18.2