From d41686502e3003b6472ad769115dfd710059a87d Mon Sep 17 00:00:00 2001 From: Enno Hermann Date: Mon, 8 Apr 2024 12:06:45 +0200 Subject: [PATCH] feat(xtts): support hindi for sentence-splitting and fine-tuning The XTTS model itself already supports Hindi, it was just missing in these components. --- TTS/demos/xtts_ft_demo/xtts_demo.py | 2 ++ TTS/tts/layers/xtts/tokenizer.py | 7 ++++++- docs/source/models/xtts.md | 21 ++++++++++++++++++--- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/TTS/demos/xtts_ft_demo/xtts_demo.py b/TTS/demos/xtts_ft_demo/xtts_demo.py index 85168c64..7ac38ed6 100644 --- a/TTS/demos/xtts_ft_demo/xtts_demo.py +++ b/TTS/demos/xtts_ft_demo/xtts_demo.py @@ -192,6 +192,7 @@ if __name__ == "__main__": "hu", "ko", "ja", + "hi", ], ) progress_data = gr.Label(label="Progress:") @@ -370,6 +371,7 @@ if __name__ == "__main__": "hu", "ko", "ja", + "hi", ], ) tts_text = gr.Textbox( diff --git a/TTS/tts/layers/xtts/tokenizer.py b/TTS/tts/layers/xtts/tokenizer.py index 1a3cc47a..6cbd374f 100644 --- a/TTS/tts/layers/xtts/tokenizer.py +++ b/TTS/tts/layers/xtts/tokenizer.py @@ -11,6 +11,7 @@ from num2words import num2words from spacy.lang.ar import Arabic from spacy.lang.en import English from spacy.lang.es import Spanish +from spacy.lang.hi import Hindi from spacy.lang.ja import Japanese from spacy.lang.zh import Chinese from tokenizers import Tokenizer @@ -19,6 +20,7 @@ from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words def get_spacy_lang(lang): + """Return Spacy language used for sentence splitting.""" if lang == "zh": return Chinese() elif lang == "ja": @@ -27,8 +29,10 @@ def get_spacy_lang(lang): return Arabic() elif lang == "es": return Spanish() + elif lang == "hi": + return Hindi() else: - # For most languages, Enlish does the job + # For most languages, English does the job return English() @@ -611,6 +615,7 @@ class VoiceBpeTokenizer: "ja": 71, "hu": 224, "ko": 95, + "hi": 150, } @cached_property diff --git 
a/docs/source/models/xtts.md b/docs/source/models/xtts.md index de166741..cc7c36b7 100644 --- a/docs/source/models/xtts.md +++ b/docs/source/models/xtts.md @@ -14,16 +14,31 @@ There is no need for an excessive amount of training data that spans countless h ### Updates with v2 - Improved voice cloning. - Voices can be cloned with a single audio file or multiple audio files, without any effect on the runtime. -- 2 new languages: Hungarian and Korean. - Across the board quality improvements. ### Code Current implementation only supports inference and GPT encoder training. ### Languages -As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko). +XTTS-v2 supports 17 languages: -Stay tuned as we continue to add support for more languages. If you have any language requests, please feel free to reach out. +- Arabic (ar) +- Chinese (zh-cn) +- Czech (cs) +- Dutch (nl) +- English (en) +- French (fr) +- German (de) +- Hindi (hi) +- Hungarian (hu) +- Italian (it) +- Japanese (ja) +- Korean (ko) +- Polish (pl) +- Portuguese (pt) +- Russian (ru) +- Spanish (es) +- Turkish (tr) ### License This model is licensed under [Coqui Public Model License](https://coqui.ai/cpml).