From fa0cbd78fea12467d7bb2a3a3407dc3530186600 Mon Sep 17 00:00:00 2001 From: Jake Tae Date: Mon, 28 Aug 2023 05:19:00 -0400 Subject: [PATCH 1/6] Update README with new device API (#2876) * docs: update readme w/ .to(device) api * docs: add .to(device) in python quickstart * docs: move section header out of comment * chore: use device instead of hard-coded string * docs: update inference.md --- README.md | 21 ++++++++++++--------- docs/source/inference.md | 12 ++++++------ 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 9d82eca6..6697a192 100644 --- a/README.md +++ b/README.md @@ -187,18 +187,21 @@ More details about the docker images (like GPU support) can be found [here](http ### 🐍 Python API +#### Running a multi-speaker and multi-lingual model + ```python +import torch from TTS.api import TTS -# Running a multi-speaker and multi-lingual model +# Get device +device = "cuda" if torch.cuda.is_available() else "cpu" # List available 🐸TTS models and choose the first one -model_name = TTS.list_models()[0] +model_name = TTS().list_models()[0] # Init TTS -tts = TTS(model_name) +tts = TTS(model_name).to(device) # Run TTS - # ❗ Since this model is multi-speaker and multi-lingual, we must set the target speaker and the language # Text to speech with a numpy output wav = tts.tts("This is a test! 
This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0]) @@ -210,13 +213,13 @@ tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.langu ```python # Init TTS with the target model name -tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False) +tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False).to(device) + # Run TTS tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH) # Example voice cloning with YourTTS in English, French and Portuguese - -tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True) +tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to(device) tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav") tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav") tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav") @@ -227,7 +230,7 @@ tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", Converting the voice in `source_wav` to the voice of `target_wav` ```python -tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=True) +tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to("cuda") tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav") ``` @@ -256,7 +259,7 @@ These models will follow the naming convention `coqui_studio/en//coqui_studio models = TTS().list_models() # Init TTS with the target studio speaker -tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False, gpu=False) +tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", 
progress_bar=False) # Run TTS tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH) # Run TTS with emotion and speed control @@ -222,7 +222,7 @@ You can find the list of language ISO codes [here](https://dl.fbaipublicfiles.co ```python from TTS.api import TTS -api = TTS(model_name="tts_models/eng/fairseq/vits", gpu=True) +api = TTS(model_name="tts_models/eng/fairseq/vits").to("cuda") api.tts_to_file("This is a test.", file_path="output.wav") # TTS with on the fly voice conversion From b79b6f076226ff7d95eea47c3c3a62e0665e17a0 Mon Sep 17 00:00:00 2001 From: Jake Tae Date: Mon, 28 Aug 2023 05:20:12 -0400 Subject: [PATCH 2/6] feature: add device flag to tts cli (#2875) --- TTS/bin/synthesize.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index d4350cd5..5ded3067 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -169,6 +169,7 @@ If you don't specify any models, then it uses LJSpeech based English model. help="Output wav file path.", ) parser.add_argument("--use_cuda", type=bool, help="Run model on CUDA.", default=False) + parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu") parser.add_argument( "--vocoder_path", type=str, @@ -391,6 +392,10 @@ If you don't specify any models, then it uses LJSpeech based English model. if args.encoder_path is not None: encoder_path = args.encoder_path encoder_config_path = args.encoder_config_path + + device = args.device + if args.use_cuda: + device = "cuda" # load models synthesizer = Synthesizer( @@ -406,8 +411,7 @@ If you don't specify any models, then it uses LJSpeech based English model. vc_config_path, model_dir, args.voice_dir, - args.use_cuda, - ) + ).to(device) # query speaker ids of a multi-speaker model. 
if args.list_speaker_idxs: From fead04f779c26b6bc99a608542ecea8d39f0fbc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ale=C5=9B=20Bu=C5=82oj=C4=8Dyk?= Date: Mon, 28 Aug 2023 12:20:45 +0300 Subject: [PATCH 3/6] Add phonemizer for Belarusian language (#2856) --- TTS/tts/utils/text/belarusian/__init__.py | 0 TTS/tts/utils/text/belarusian/phonemizer.py | 34 ++++++++++++ TTS/tts/utils/text/phonemizers/__init__.py | 4 ++ .../text/phonemizers/belarusian_phonemizer.py | 55 +++++++++++++++++++ recipes/bel-alex73/train_glowtts.py | 4 +- .../text_tests/test_belarusian_phonemizer.py | 29 ++++++++++ 6 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 TTS/tts/utils/text/belarusian/__init__.py create mode 100644 TTS/tts/utils/text/belarusian/phonemizer.py create mode 100644 TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py create mode 100644 tests/text_tests/test_belarusian_phonemizer.py diff --git a/TTS/tts/utils/text/belarusian/__init__.py b/TTS/tts/utils/text/belarusian/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/TTS/tts/utils/text/belarusian/phonemizer.py b/TTS/tts/utils/text/belarusian/phonemizer.py new file mode 100644 index 00000000..3c07a209 --- /dev/null +++ b/TTS/tts/utils/text/belarusian/phonemizer.py @@ -0,0 +1,34 @@ +import os + +finder = None + + +def init(): + try: + import jpype + import jpype.imports + except ModuleNotFoundError: + raise ModuleNotFoundError("Belarusian phonemizer requires to install module 'jpype1' manually. 
Try `pip install jpype1`.") + + try: + jar_path = os.environ["BEL_FANETYKA_JAR"] + except KeyError: + raise KeyError("You need to define 'BEL_FANETYKA_JAR' environment variable as path to the fanetyka.jar file") + + jpype.startJVM(classpath=[jar_path]) + + # import the Java modules + from org.alex73.korpus.base import GrammarDB2, GrammarFinder + + grammar_db = GrammarDB2.initializeFromJar() + global finder + finder = GrammarFinder(grammar_db) + + +def belarusian_text_to_phonemes(text: str) -> str: + # Initialize only on first run + if finder is None: + init() + + from org.alex73.fanetyka.impl import FanetykaText + return str(FanetykaText(finder, text).ipa) diff --git a/TTS/tts/utils/text/phonemizers/__init__.py b/TTS/tts/utils/text/phonemizers/__init__.py index 5c285731..638184fd 100644 --- a/TTS/tts/utils/text/phonemizers/__init__.py +++ b/TTS/tts/utils/text/phonemizers/__init__.py @@ -1,4 +1,5 @@ from TTS.tts.utils.text.phonemizers.bangla_phonemizer import BN_Phonemizer +from TTS.tts.utils.text.phonemizers.belarusian_phonemizer import BEL_Phonemizer from TTS.tts.utils.text.phonemizers.base import BasePhonemizer from TTS.tts.utils.text.phonemizers.espeak_wrapper import ESpeak from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut @@ -35,6 +36,7 @@ DEF_LANG_TO_PHONEMIZER["en"] = DEF_LANG_TO_PHONEMIZER["en-us"] DEF_LANG_TO_PHONEMIZER["zh-cn"] = ZH_CN_Phonemizer.name() DEF_LANG_TO_PHONEMIZER["ko-kr"] = KO_KR_Phonemizer.name() DEF_LANG_TO_PHONEMIZER["bn"] = BN_Phonemizer.name() +DEF_LANG_TO_PHONEMIZER["be"] = BEL_Phonemizer.name() # JA phonemizer has deal breaking dependencies like MeCab for some systems. 
@@ -68,6 +70,8 @@ def get_phonemizer_by_name(name: str, **kwargs) -> BasePhonemizer: return KO_KR_Phonemizer(**kwargs) if name == "bn_phonemizer": return BN_Phonemizer(**kwargs) + if name == "be_phonemizer": + return BEL_Phonemizer(**kwargs) raise ValueError(f"Phonemizer {name} not found") diff --git a/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py b/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py new file mode 100644 index 00000000..fb620766 --- /dev/null +++ b/TTS/tts/utils/text/phonemizers/belarusian_phonemizer.py @@ -0,0 +1,55 @@ +from typing import Dict + +from TTS.tts.utils.text.phonemizers.base import BasePhonemizer +from TTS.tts.utils.text.belarusian.phonemizer import belarusian_text_to_phonemes + +_DEF_BE_PUNCS = ",!." # TODO + + +class BEL_Phonemizer(BasePhonemizer): + """🐸TTS be phonemizer using functions in `TTS.tts.utils.text.belarusian.phonemizer` + + Args: + punctuations (str): + Set of characters to be treated as punctuation. Defaults to `_DEF_BE_PUNCS`. + + keep_puncs (bool): + If True, keep the punctuations after phonemization. Defaults to True. 
+ """ + + language = "be" + + def __init__(self, punctuations=_DEF_BE_PUNCS, keep_puncs=True, **kwargs): # pylint: disable=unused-argument + super().__init__(self.language, punctuations=punctuations, keep_puncs=keep_puncs) + + @staticmethod + def name(): + return "be_phonemizer" + + @staticmethod + def phonemize_be(text: str, separator: str = "|") -> str: # pylint: disable=unused-argument + return belarusian_text_to_phonemes(text) + + def _phonemize(self, text, separator): + return self.phonemize_be(text, separator) + + @staticmethod + def supported_languages() -> Dict: + return {"be": "Belarusian"} + + def version(self) -> str: + return "0.0.1" + + def is_available(self) -> bool: + return True + + +if __name__ == "__main__": + txt = "тэст" + e = BEL_Phonemizer() + print(e.supported_languages()) + print(e.version()) + print(e.language) + print(e.name()) + print(e.is_available()) + print("`" + e.phonemize(txt) + "`") diff --git a/recipes/bel-alex73/train_glowtts.py b/recipes/bel-alex73/train_glowtts.py index e0827cdc..24b62d79 100644 --- a/recipes/bel-alex73/train_glowtts.py +++ b/recipes/bel-alex73/train_glowtts.py @@ -60,7 +60,7 @@ config = GlowTTSConfig( output_path=output_path, add_blank=True, datasets=[dataset_config], - characters=characters, +# characters=characters, enable_eos_bos_chars=True, mixed_precision=False, save_step=10000, @@ -69,6 +69,8 @@ config = GlowTTSConfig( text_cleaner="no_cleaners", audio=audio_config, test_sentences=[], + use_phonemes=True, + phoneme_language="be", ) if __name__ == "__main__": diff --git a/tests/text_tests/test_belarusian_phonemizer.py b/tests/text_tests/test_belarusian_phonemizer.py new file mode 100644 index 00000000..278ee8be --- /dev/null +++ b/tests/text_tests/test_belarusian_phonemizer.py @@ -0,0 +1,29 @@ +import os +import warnings +import unittest + +from TTS.tts.utils.text.belarusian.phonemizer import belarusian_text_to_phonemes + +_TEST_CASES = """ +Фанетычны канвертар/fanʲɛˈtɨt͡ʂnɨ kanˈvʲɛrtar +Гэтак мы 
працавалі/ˈɣɛtak ˈmɨ prat͡saˈvalʲi +""" + + +class TestText(unittest.TestCase): + def test_belarusian_text_to_phonemes(self): + try: + os.environ["BEL_FANETYKA_JAR"] + except KeyError: + warnings.warn( + "You need to define 'BEL_FANETYKA_JAR' environment variable as path to the fanetyka.jar file to test Belarusian phonemizer", + Warning) + return + + for line in _TEST_CASES.strip().split("\n"): + text, phonemes = line.split("/") + self.assertEqual(belarusian_text_to_phonemes(text), phonemes) + + +if __name__ == "__main__": + unittest.main() From 32b8ebb6339bd5f5221de5ebbea27bd85756e4fa Mon Sep 17 00:00:00 2001 From: Unik <75041527+Exponefrv1@users.noreply.github.com> Date: Mon, 4 Sep 2023 12:39:19 +0300 Subject: [PATCH 4/6] Updated scipy version (#2914) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 90d238b7..5712c2c8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ numpy==1.22.0;python_version<="3.10" numpy==1.24.3;python_version>"3.10" cython==0.29.30 -scipy>=1.4.0 +scipy>=1.11.2 torch>=1.7 torchaudio soundfile From d1d95707bd212b5d4fd113906b9c7ff6e2d0743b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 4 Sep 2023 12:28:36 +0200 Subject: [PATCH 5/6] Update docs (#2919) --- docs/requirements.txt | 2 +- docs/source/docker_images.md | 2 +- docs/source/implementing_a_new_model.md | 4 +--- docs/source/main_classes/model_api.md | 6 +++--- docs/source/models/bark.md | 6 ------ 5 files changed, 6 insertions(+), 14 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 195ba1d8..efbefec4 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,6 +1,6 @@ furo myst-parser == 2.0.0 -sphinx == 7.0.1 +sphinx == 7.2.5 sphinx_inline_tabs sphinx_copybutton linkify-it-py \ No newline at end of file diff --git a/docs/source/docker_images.md b/docs/source/docker_images.md index 55d54afd..d08a5583 100644 --- 
a/docs/source/docker_images.md +++ b/docs/source/docker_images.md @@ -43,7 +43,7 @@ Start the container and get a shell inside it. ```bash docker run --rm -it -p 5002:5002 --entrypoint /bin/bash ghcr.io/coqui-ai/tts-cpu python3 TTS/server/server.py --list_models #To get the list of available models -python3 TTS/server/server.py --model_name tts_models/en/vctk/vits +python3 TTS/server/server.py --model_name tts_models/en/vctk/vits ``` ### GPU version diff --git a/docs/source/implementing_a_new_model.md b/docs/source/implementing_a_new_model.md index 176c4865..134271ff 100644 --- a/docs/source/implementing_a_new_model.md +++ b/docs/source/implementing_a_new_model.md @@ -36,7 +36,7 @@ There is also the `callback` interface by which you can manipulate both the model and the `Trainer` states. Callbacks give you an infinite flexibility to add custom behaviours for your model and training routines. - For more details, see {ref}`BaseTTS ` and :obj:`TTS.utils.callbacks`. + For more details, see {ref}`BaseTTS ` and :obj:`TTS.utils.callbacks`. 6. Optionally, define `MyModelArgs`. @@ -204,5 +204,3 @@ class MyModel(BaseTTS): pass ``` - - diff --git a/docs/source/main_classes/model_api.md b/docs/source/main_classes/model_api.md index 6781a268..0e6f2d94 100644 --- a/docs/source/main_classes/model_api.md +++ b/docs/source/main_classes/model_api.md @@ -5,18 +5,18 @@ Model API provides you a set of functions that easily make your model compatible ## Base TTS Model ```{eval-rst} -.. autoclass:: TTS.model.BaseModel +.. autoclass:: TTS.model.BaseTrainerModel :members: ``` -## Base `tts` Model +## Base tts Model ```{eval-rst} .. autoclass:: TTS.tts.models.base_tts.BaseTTS :members: ``` -## Base `vocoder` Model +## Base vocoder Model ```{eval-rst} .. 
autoclass:: TTS.vocoder.models.base_vocoder.BaseVocoder diff --git a/docs/source/models/bark.md b/docs/source/models/bark.md index 978d793a..4092d9f4 100644 --- a/docs/source/models/bark.md +++ b/docs/source/models/bark.md @@ -91,12 +91,6 @@ tts --model_name tts_models/multilingual/multi-dataset/bark \ :members: ``` -## BarkArgs -```{eval-rst} -.. autoclass:: TTS.tts.models.bark.BarkArgs - :members: -``` - ## Bark Model ```{eval-rst} .. autoclass:: TTS.tts.models.bark.Bark From 40b527345fba1f38f41527fc9f2d6f66db5c121e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 4 Sep 2023 12:51:53 +0200 Subject: [PATCH 6/6] Bump up to v0.16.6 --- TTS/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TTS/VERSION b/TTS/VERSION index 19270385..c3f65805 100644 --- a/TTS/VERSION +++ b/TTS/VERSION @@ -1 +1 @@ -0.16.5 +0.16.6