Merge pull request #4 from idiap/hindi

feat(xtts): support Hindi for sentence-splitting and fine-tuning
2024-04-11 16:49:44 +02:00 · 2024-04-11 16:49:44 +02:00 · 2ad790d169
parent dfbe0168e9 d41686502e
commit 2ad790d169
3 changed files with 26 additions and 4 deletions
--- a/TTS/demos/xtts_ft_demo/xtts_demo.py
+++ b/TTS/demos/xtts_ft_demo/xtts_demo.py
@@ -192,6 +192,7 @@ if __name__ == "__main__":
                    "hu",
                    "ko",
                    "ja",
+                    "hi",
                ],
            )
            progress_data = gr.Label(label="Progress:")
@@ -370,6 +371,7 @@ if __name__ == "__main__":
                            "hu",
                            "ko",
                            "ja",
+                            "hi",
                        ],
                    )
                    tts_text = gr.Textbox(
--- a/TTS/tts/layers/xtts/tokenizer.py
+++ b/TTS/tts/layers/xtts/tokenizer.py
@@ -12,6 +12,7 @@ from num2words import num2words
 from spacy.lang.ar import Arabic
 from spacy.lang.en import English
 from spacy.lang.es import Spanish
+from spacy.lang.hi import Hindi
 from spacy.lang.ja import Japanese
 from spacy.lang.zh import Chinese
 from tokenizers import Tokenizer
@@ -22,6 +23,7 @@ logger = logging.getLogger(__name__)


 def get_spacy_lang(lang):
+    """Return Spacy language used for sentence splitting."""
    if lang == "zh":
        return Chinese()
    elif lang == "ja":
@@ -30,8 +32,10 @@ def get_spacy_lang(lang):
        return Arabic()
    elif lang == "es":
        return Spanish()
+    elif lang == "hi":
+        return Hindi()
    else:
-        # For most languages, Enlish does the job
+        # For most languages, English does the job
        return English()


@@ -614,6 +618,7 @@ class VoiceBpeTokenizer:
            "ja": 71,
            "hu": 224,
            "ko": 95,
+            "hi": 150,
        }

    @cached_property
--- a/docs/source/models/xtts.md
+++ b/docs/source/models/xtts.md
@@ -14,16 +14,31 @@ There is no need for an excessive amount of training data that spans countless h
 ### Updates with v2
 - Improved voice cloning.
 - Voices can be cloned with a single audio file or multiple audio files, without any effect on the runtime.
- 2 new languages: Hungarian and Korean.
 - Across the board quality improvements.

 ### Code
 Current implementation only supports inference and GPT encoder training.

 ### Languages
-As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko).
+XTTS-v2 supports 17 languages:

-Stay tuned as we continue to add support for more languages. If you have any language requests, please feel free to reach out.
+- Arabic (ar)
+- Chinese (zh-cn)
+- Czech (cs)
+- Dutch (nl)
+- English (en)
+- French (fr)
+- German (de)
+- Hindi (hi)
+- Hungarian (hu)
+- Italian (it)
+- Japanese (ja)
+- Korean (ko)
+- Polish (pl)
+- Portuguese (pt)
+- Russian (ru)
+- Spanish (es)
+- Turkish (tr)

 ### License
 This model is licensed under [Coqui Public Model License](https://coqui.ai/cpml).