From dd4287de1fce944c77cef7498e0daf1a2154abfc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 3 Mar 2022 20:23:00 +0100
Subject: [PATCH] Update models

---
 TTS/.models.json                | 21 ++++++---------------
 TTS/tts/models/vits.py          |  4 ++--
 TTS/tts/utils/synthesis.py      |  2 +-
 TTS/tts/utils/text/tokenizer.py |  3 +++
 4 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/TTS/.models.json b/TTS/.models.json
index 2e6c0ebf..366358be 100644
--- a/TTS/.models.json
+++ b/TTS/.models.json
@@ -33,7 +33,7 @@
             },
             "tacotron2-DDC_ph": {
                 "description": "Tacotron2 with Double Decoder Consistency with phonemes.",
-                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/tts_models--en--ljspeech--tacotronDDC_ph.zip",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
                 "default_vocoder": "vocoder_models/en/ljspeech/univnet",
                 "commit": "3900448",
                 "author": "Eren Gölge @erogol",
@@ -71,7 +71,7 @@
             },
             "vits": {
                 "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
-                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/tts_models--en--ljspeech--vits.zip",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--vits.zip",
                 "default_vocoder": null,
                 "commit": "3900448",
                 "author": "Eren Gölge @erogol",
@@ -89,18 +89,9 @@
             }
         },
         "vctk": {
-            "sc-glow-tts": {
-                "description": "Multi-Speaker Transformers based SC-Glow model from https://arxiv.org/abs/2104.05557.",
-                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.0/tts_models--en--vctk--sc-glow-tts.zip",
-                "default_vocoder": "vocoder_models/en/vctk/hifigan_v2",
-                "commit": "b531fa69",
-                "author": "Edresson Casanova",
-                "license": "",
-                "contact": ""
-            },
             "vits": {
                 "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
-                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/tts_models--en--vctk--vits.zip",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--vits.zip",
                 "default_vocoder": null,
                 "commit": "3900448",
                 "author": "Eren @erogol",
@@ -109,7 +100,7 @@
             },
             "fast_pitch":{
                 "description": "FastPitch model trained on VCTK dataseset.",
-                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.4.0/tts_models--en--vctk--fast_pitch.zip",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--fast_pitch.zip",
                 "default_vocoder": null,
                 "commit": "bdab788d",
                 "author": "Eren @erogol",
@@ -156,7 +147,7 @@
         "uk":{
             "mai": {
                 "glow-tts": {
-                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.4.0/tts_models--uk--mailabs--glow-tts.zip",
+                    "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--uk--mai--glow-tts.zip",
                     "author":"@robinhad",
                     "commit": "bdab788d",
                     "license": "MIT",
@@ -168,7 +159,7 @@
     "zh-CN": {
         "baker": {
             "tacotron2-DDC-GST": {
-                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
+                "github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
                 "commit": "unknown",
                 "author": "@kirianguiller",
                 "default_vocoder": null
diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index 1ad8807f..a43e081c 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -1470,7 +1470,7 @@ class Vits(BaseTTS):
         """
         from TTS.utils.audio import AudioProcessor

-        upsample_rate = math.prod(config.model_args.upsample_rates_decoder)
+        upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item()
         assert (
             upsample_rate == config.audio.hop_length
         ), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}"
@@ -1480,7 +1480,7 @@ class Vits(BaseTTS):
         speaker_manager = SpeakerManager.init_from_config(config, samples)
         language_manager = LanguageManager.init_from_config(config)

-        if config.model_args.speaker_encoder_model_path is not None:
+        if config.model_args.speaker_encoder_model_path:
             speaker_manager.init_speaker_encoder(
                 config.model_args.speaker_encoder_model_path, config.model_args.speaker_encoder_config_path
             )
diff --git a/TTS/tts/utils/synthesis.py b/TTS/tts/utils/synthesis.py
index 377f32de..4ec84a3d 100644
--- a/TTS/tts/utils/synthesis.py
+++ b/TTS/tts/utils/synthesis.py
@@ -167,7 +167,7 @@ def synthesis(
         style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda)
     # convert text to sequence of token IDs
     text_inputs = np.asarray(
-        model.tokenizer.text_to_ids(text),
+        model.tokenizer.text_to_ids(text, language=language_id),
         dtype=np.int32,
     )
     # pass tensors to backend
diff --git a/TTS/tts/utils/text/tokenizer.py b/TTS/tts/utils/text/tokenizer.py
index 50a5f519..f0d85a44 100644
--- a/TTS/tts/utils/text/tokenizer.py
+++ b/TTS/tts/utils/text/tokenizer.py
@@ -93,6 +93,9 @@ class TTSTokenizer:
             language(str):
                 The language code of the text. Defaults to None.

+        TODO:
+            - Add support for language-specific processing.
+
         1. Text normalizatin
         2. Phonemization (if use_phonemes is True)
         3. Add blank char between characters
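Note on the TTS/tts/models/vits.py hunk above: the patched line computes the
decoder's total upsampling factor with torch instead of math.prod (which
requires Python 3.8), and the assert that follows requires this factor to
equal the audio processor's hop length, so that each input spectrogram frame
expands to exactly one hop of waveform samples. A minimal standalone sketch of
that invariant, using illustrative values rather than any shipped VitsConfig:

    import torch

    # Hypothetical config values for illustration only.
    upsample_rates_decoder = [8, 8, 2, 2]  # per-stage decoder upsampling factors
    hop_length = 256                       # STFT hop size of the audio processor

    # Same computation as the patched line: product of the per-stage rates,
    # with .item() extracting a plain Python number from the 0-dim tensor.
    upsample_rate = torch.prod(torch.as_tensor(upsample_rates_decoder)).item()

    # If this fails, generated audio length would drift from the frame count.
    assert upsample_rate == hop_length, f"{upsample_rate} vs {hop_length}"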
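Note on the speaker-encoder hunk: replacing `is not None` with a plain
truthiness test also skips encoder initialization when the configured path is
an empty string, a common placeholder default. A small sketch of the behavior
difference, with a hypothetical path value:

    # Hypothetical stand-in for config.model_args.speaker_encoder_model_path.
    speaker_encoder_model_path = ""

    # The old check (`is not None`) would enter the branch here and try to
    # load an encoder from ""; the new truthiness check does not.
    if speaker_encoder_model_path:
        print("initializing speaker encoder")
    else:
        print("no speaker encoder configured")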