Update models

This commit is contained in:
Eren Gölge 2022-03-03 20:23:00 +01:00
parent 6cb00be795
commit dd4287de1f
4 changed files with 12 additions and 18 deletions

View File

@ -33,7 +33,7 @@
},
"tacotron2-DDC_ph": {
"description": "Tacotron2 with Double Decoder Consistency with phonemes.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/tts_models--en--ljspeech--tacotronDDC_ph.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
"default_vocoder": "vocoder_models/en/ljspeech/univnet",
"commit": "3900448",
"author": "Eren Gölge @erogol",
@ -71,7 +71,7 @@
},
"vits": {
"description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/tts_models--en--ljspeech--vits.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--ljspeech--vits.zip",
"default_vocoder": null,
"commit": "3900448",
"author": "Eren Gölge @erogol",
@ -89,18 +89,9 @@
}
},
"vctk": {
"sc-glow-tts": {
"description": "Multi-Speaker Transformers based SC-Glow model from https://arxiv.org/abs/2104.05557.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.1.0/tts_models--en--vctk--sc-glow-tts.zip",
"default_vocoder": "vocoder_models/en/vctk/hifigan_v2",
"commit": "b531fa69",
"author": "Edresson Casanova",
"license": "",
"contact": ""
},
"vits": {
"description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.2.0/tts_models--en--vctk--vits.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--vits.zip",
"default_vocoder": null,
"commit": "3900448",
"author": "Eren @erogol",
@ -109,7 +100,7 @@
},
"fast_pitch":{
"description": "FastPitch model trained on VCTK dataset.",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.4.0/tts_models--en--vctk--fast_pitch.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--en--vctk--fast_pitch.zip",
"default_vocoder": null,
"commit": "bdab788d",
"author": "Eren @erogol",
@ -156,7 +147,7 @@
"uk":{
"mai": {
"glow-tts": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.4.0/tts_models--uk--mailabs--glow-tts.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--uk--mai--glow-tts.zip",
"author":"@robinhad",
"commit": "bdab788d",
"license": "MIT",
@ -168,7 +159,7 @@
"zh-CN": {
"baker": {
"tacotron2-DDC-GST": {
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.0.10/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
"github_rls_url": "https://coqui.gateway.scarf.sh/v0.6.0_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
"commit": "unknown",
"author": "@kirianguiller",
"default_vocoder": null

View File

@ -1470,7 +1470,7 @@ class Vits(BaseTTS):
"""
from TTS.utils.audio import AudioProcessor
upsample_rate = math.prod(config.model_args.upsample_rates_decoder)
upsample_rate = torch.prod(torch.as_tensor(config.model_args.upsample_rates_decoder)).item()
assert (
upsample_rate == config.audio.hop_length
), f" [!] Product of upsample rates must be equal to the hop length - {upsample_rate} vs {config.audio.hop_length}"
@ -1480,7 +1480,7 @@ class Vits(BaseTTS):
speaker_manager = SpeakerManager.init_from_config(config, samples)
language_manager = LanguageManager.init_from_config(config)
if config.model_args.speaker_encoder_model_path is not None:
if config.model_args.speaker_encoder_model_path:
speaker_manager.init_speaker_encoder(
config.model_args.speaker_encoder_model_path, config.model_args.speaker_encoder_config_path
)

View File

@ -167,7 +167,7 @@ def synthesis(
style_mel = compute_style_mel(style_wav, model.ap, cuda=use_cuda)
# convert text to sequence of token IDs
text_inputs = np.asarray(
model.tokenizer.text_to_ids(text),
model.tokenizer.text_to_ids(text, language=language_id),
dtype=np.int32,
)
# pass tensors to backend

View File

@ -93,6 +93,9 @@ class TTSTokenizer:
language(str):
The language code of the text. Defaults to None.
TODO:
- Add support for language-specific processing.
1. Text normalization
2. Phonemization (if use_phonemes is True)
3. Add blank char between characters