From 1ddc484b49b26c1a3108cdbe0ddc826e19202df8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Mon, 12 Dec 2022 12:04:20 +0100 Subject: [PATCH] Python API implementation (#2195) * Draft implementation * Fix style * Add api tests * Fix lint * Update docs * Update tests * Set env * Fixup * Fixup * Fix lint * Revert --- .github/workflows/aux_tests.yml | 2 + .github/workflows/data_tests.yml | 2 + .github/workflows/inference_tests.yml | 3 + .github/workflows/text_tests.yml | 2 + .github/workflows/tts_tests.yml | 2 + .github/workflows/vocoder_tests.yml | 2 + .github/workflows/zoo_tests0.yml | 2 + .github/workflows/zoo_tests1.yml | 2 + .github/workflows/zoo_tests2.yml | 2 + TTS/api.py | 146 +++++++++++++++++++++++ TTS/utils/manage.py | 19 +-- TTS/utils/synthesizer.py | 1 - docs/source/inference.md | 30 ++++- tests/inference_tests/test_python_api.py | 36 ++++++ 14 files changed, 240 insertions(+), 11 deletions(-) create mode 100644 TTS/api.py create mode 100644 tests/inference_tests/test_python_api.py diff --git a/.github/workflows/aux_tests.yml b/.github/workflows/aux_tests.yml index b40a661e..e42b964d 100644 --- a/.github/workflows/aux_tests.yml +++ b/.github/workflows/aux_tests.yml @@ -31,6 +31,8 @@ jobs: cache-dependency-path: 'requirements*' - name: check OS run: cat /etc/os-release + - name: set ENV + run: export TRAINER_TELEMETRY=0 - name: Install dependencies run: | sudo apt-get update diff --git a/.github/workflows/data_tests.yml b/.github/workflows/data_tests.yml index f49c2e48..9ed1333d 100644 --- a/.github/workflows/data_tests.yml +++ b/.github/workflows/data_tests.yml @@ -31,6 +31,8 @@ jobs: cache-dependency-path: 'requirements*' - name: check OS run: cat /etc/os-release + - name: set ENV + run: export TRAINER_TELEMETRY=0 - name: Install dependencies run: | sudo apt-get update diff --git a/.github/workflows/inference_tests.yml b/.github/workflows/inference_tests.yml index a57a16df..0ff0857d 100644 --- a/.github/workflows/inference_tests.yml 
+++ b/.github/workflows/inference_tests.yml @@ -31,10 +31,13 @@ jobs: cache-dependency-path: 'requirements*' - name: check OS run: cat /etc/os-release + - name: set ENV + run: export TRAINER_TELEMETRY=0 - name: Install dependencies run: | sudo apt-get update sudo apt-get install -y --no-install-recommends git make gcc + sudo apt-get install espeak-ng make system-deps - name: Install/upgrade Python setup deps run: python3 -m pip install --upgrade pip setuptools wheel diff --git a/.github/workflows/text_tests.yml b/.github/workflows/text_tests.yml index 8a46d051..9ae0a058 100644 --- a/.github/workflows/text_tests.yml +++ b/.github/workflows/text_tests.yml @@ -31,6 +31,8 @@ jobs: cache-dependency-path: 'requirements*' - name: check OS run: cat /etc/os-release + - name: set ENV + run: export TRAINER_TELEMETRY=0 - name: Install dependencies run: | sudo apt-get update diff --git a/.github/workflows/tts_tests.yml b/.github/workflows/tts_tests.yml index 524bedce..6d35171e 100644 --- a/.github/workflows/tts_tests.yml +++ b/.github/workflows/tts_tests.yml @@ -31,6 +31,8 @@ jobs: cache-dependency-path: 'requirements*' - name: check OS run: cat /etc/os-release + - name: set ENV + run: export TRAINER_TELEMETRY=0 - name: Install dependencies run: | sudo apt-get update diff --git a/.github/workflows/vocoder_tests.yml b/.github/workflows/vocoder_tests.yml index a8df2e71..cfa8e6af 100644 --- a/.github/workflows/vocoder_tests.yml +++ b/.github/workflows/vocoder_tests.yml @@ -31,6 +31,8 @@ jobs: cache-dependency-path: 'requirements*' - name: check OS run: cat /etc/os-release + - name: set ENV + run: export TRAINER_TELEMETRY=0 - name: Install dependencies run: | sudo apt-get update diff --git a/.github/workflows/zoo_tests0.yml b/.github/workflows/zoo_tests0.yml index 01e1c400..d5f4cc14 100644 --- a/.github/workflows/zoo_tests0.yml +++ b/.github/workflows/zoo_tests0.yml @@ -31,6 +31,8 @@ jobs: cache-dependency-path: 'requirements*' - name: check OS run: cat /etc/os-release + - name: 
set ENV + run: export TRAINER_TELEMETRY=0 - name: Install dependencies run: | sudo apt-get update diff --git a/.github/workflows/zoo_tests1.yml b/.github/workflows/zoo_tests1.yml index 1650aa12..7f15f977 100644 --- a/.github/workflows/zoo_tests1.yml +++ b/.github/workflows/zoo_tests1.yml @@ -31,6 +31,8 @@ jobs: cache-dependency-path: 'requirements*' - name: check OS run: cat /etc/os-release + - name: set ENV + run: export TRAINER_TELEMETRY=0 - name: Install dependencies run: | sudo apt-get update diff --git a/.github/workflows/zoo_tests2.yml b/.github/workflows/zoo_tests2.yml index f5a155ab..9975a2cf 100644 --- a/.github/workflows/zoo_tests2.yml +++ b/.github/workflows/zoo_tests2.yml @@ -31,6 +31,8 @@ jobs: cache-dependency-path: 'requirements*' - name: check OS run: cat /etc/os-release + - name: set ENV + run: export TRAINER_TELEMETRY=0 - name: Install dependencies run: | sudo apt-get update diff --git a/TTS/api.py b/TTS/api.py new file mode 100644 index 00000000..99c3e522 --- /dev/null +++ b/TTS/api.py @@ -0,0 +1,146 @@ +from pathlib import Path + +from TTS.utils.manage import ModelManager +from TTS.utils.synthesizer import Synthesizer + + +class TTS: + """TODO: Add voice conversion and Capacitron support.""" + + def __init__(self, model_name: str = None, progress_bar: bool = True, gpu=False): + """🐸TTS python interface that allows to load and use the released models. + + Example with a multi-speaker model: + >>> from TTS.api import TTS + >>> tts = TTS(TTS.list_models()[0]) + >>> wav = tts.tts("This is a test! 
This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0]) + >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav") + + Example with a single-speaker model: + >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False) + >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav") + + Args: + model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None. + progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True. + gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. + """ + self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False) + self.synthesizer = None + if model_name: + self.load_model_by_name(model_name, gpu) + + @property + def models(self): + return self.manager.list_tts_models() + + @property + def is_multi_speaker(self): + if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager: + return self.synthesizer.tts_model.speaker_manager.num_speakers > 1 + return False + + @property + def is_multi_lingual(self): + if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager: + return self.synthesizer.tts_model.language_manager.num_languages > 1 + return False + + @property + def speakers(self): + if not self.is_multi_speaker: + return None + return self.synthesizer.tts_model.speaker_manager.speaker_names + + @property + def languages(self): + if not self.is_multi_lingual: + return None + return self.synthesizer.tts_model.language_manager.language_names + + @staticmethod + def get_models_file_path(): + return Path(__file__).parent / ".models.json" + + @staticmethod + def list_models(): + manager = ModelManager(models_file=TTS.get_models_file_path(), 
progress_bar=False, verbose=False) + return manager.list_tts_models() + + def download_model_by_name(self, model_name: str): + model_path, config_path, model_item = self.manager.download_model(model_name) + if model_item["default_vocoder"] is None: + return model_path, config_path, None, None + vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"]) + return model_path, config_path, vocoder_path, vocoder_config_path + + def load_model_by_name(self, model_name: str, gpu: bool = False): + model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name) + # init synthesizer + # None values are fetched from the model + self.synthesizer = Synthesizer( + tts_checkpoint=model_path, + tts_config_path=config_path, + tts_speakers_file=None, + tts_languages_file=None, + vocoder_checkpoint=vocoder_path, + vocoder_config=vocoder_config_path, + encoder_checkpoint=None, + encoder_config=None, + use_cuda=gpu, + ) + + def _check_arguments(self, speaker: str = None, language: str = None): + if self.is_multi_speaker and speaker is None: + raise ValueError("Model is multi-speaker but no speaker is provided.") + if self.is_multi_lingual and language is None: + raise ValueError("Model is multi-lingual but no language is provided.") + if not self.is_multi_speaker and speaker is not None: + raise ValueError("Model is not multi-speaker but speaker is provided.") + if not self.is_multi_lingual and language is not None: + raise ValueError("Model is not multi-lingual but language is provided.") + + def tts(self, text: str, speaker: str = None, language: str = None): + """Convert text to speech. + + Args: + text (str): + Input text to synthesize. + speaker (str, optional): + Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by + `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. + language (str, optional): + Language code for multi-lingual models. 
You can check whether loaded model is multi-lingual by + `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None. + """ + self._check_arguments(speaker=speaker, language=language) + + wav = self.synthesizer.tts( + text=text, + speaker_name=speaker, + language_name=language, + speaker_wav=None, + reference_wav=None, + style_wav=None, + style_text=None, + reference_speaker_name=None, + ) + return wav + + def tts_to_file(self, text: str, speaker: str = None, language: str = None, file_path: str = "output.wav"): + """Convert text to speech and save it to a file. + + Args: + text (str): + Input text to synthesize. + speaker (str, optional): + Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by + `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None. + language (str, optional): + Language code for multi-lingual models. You can check whether loaded model is multi-lingual by + `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None. + file_path (str, optional): + Output file path. Defaults to "output.wav". + """ + wav = self.tts(text=text, speaker=speaker, language=language) + self.synthesizer.save_wav(wav=wav, path=file_path) diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 645099e0..babe54f3 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -35,11 +35,13 @@ class ModelManager(object): models_file (str): path to .model.json file. Defaults to None. output_prefix (str): prefix to `tts` to download models. Defaults to None progress_bar (bool): print a progress bar when donwloading a file. Defaults to False. + verbose (bool): print info. Defaults to True. 
""" - def __init__(self, models_file=None, output_prefix=None, progress_bar=False): + def __init__(self, models_file=None, output_prefix=None, progress_bar=False, verbose=True): super().__init__() self.progress_bar = progress_bar + self.verbose = verbose if output_prefix is None: self.output_prefix = get_user_data_dir("tts") else: @@ -62,30 +64,31 @@ class ModelManager(object): self.models_dict = json.load(json_file) def _list_models(self, model_type, model_count=0): + if self.verbose: + print(" Name format: type/language/dataset/model") model_list = [] for lang in self.models_dict[model_type]: for dataset in self.models_dict[model_type][lang]: for model in self.models_dict[model_type][lang][dataset]: model_full_name = f"{model_type}--{lang}--{dataset}--{model}" output_path = os.path.join(self.output_prefix, model_full_name) - if os.path.exists(output_path): - print(f" {model_count}: {model_type}/{lang}/{dataset}/{model} [already downloaded]") - else: - print(f" {model_count}: {model_type}/{lang}/{dataset}/{model}") + if self.verbose: + if os.path.exists(output_path): + print(f" {model_count}: {model_type}/{lang}/{dataset}/{model} [already downloaded]") + else: + print(f" {model_count}: {model_type}/{lang}/{dataset}/{model}") model_list.append(f"{model_type}/{lang}/{dataset}/{model}") model_count += 1 return model_list def _list_for_model_type(self, model_type): - print(" Name format: language/dataset/model") models_name_list = [] model_count = 1 model_type = "tts_models" models_name_list.extend(self._list_models(model_type, model_count)) - return [name.replace(model_type + "/", "") for name in models_name_list] + return models_name_list def list_models(self): - print(" Name format: type/language/dataset/model") models_name_list = [] model_count = 1 for model_type in self.models_dict: diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py index bc3fc0aa..4a0ab038 100644 --- a/TTS/utils/synthesizer.py +++ b/TTS/utils/synthesizer.py @@ -62,7 +62,6 @@ class 
Synthesizer(object): self.tts_model = None self.vocoder_model = None self.speaker_manager = None - self.num_speakers = 0 self.tts_speakers = {} self.language_manager = None self.num_languages = 0 diff --git a/docs/source/inference.md b/docs/source/inference.md index 1057d04d..d7d63a69 100644 --- a/docs/source/inference.md +++ b/docs/source/inference.md @@ -11,6 +11,7 @@ After the installation, 2 terminal commands are available. 1. TTS Command Line Interface (CLI). - `tts` 2. Local Demo Server. - `tts-server` +3. In 🐍Python. - `from TTS.api import TTS` ## On the Commandline - `tts` ![cli.gif](https://github.com/coqui-ai/TTS/raw/main/images/tts_cli.gif) @@ -99,5 +100,30 @@ tts-server --model_name "///" \ --vocoder_name "///" ``` -## TorchHub -You can also use [this simple colab notebook](https://colab.research.google.com/drive/1iAe7ZdxjUIuN6V4ooaCt0fACEGKEn7HW?usp=sharing) using TorchHub to synthesize speech. \ No newline at end of file +## Python API + +You can run a multi-speaker and multi-lingual model in Python as + +```python +from TTS.api import TTS + +# List available 🐸TTS models and choose the first one +model_name = TTS.list_models()[0] +# Init TTS +tts = TTS(model_name) +# Run TTS +# ❗ Since this model is multi-speaker and multi-lingual, we must set the target speaker and the language +# Text to speech with a numpy output +wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0]) +# Text to speech to a file +tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav") +``` + +Here is an example for a single speaker model. 
+ +```python +# Init TTS with the target model name +tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False) +# Run TTS +tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav") +``` \ No newline at end of file diff --git a/tests/inference_tests/test_python_api.py b/tests/inference_tests/test_python_api.py new file mode 100644 index 00000000..fdd7e1cb --- /dev/null +++ b/tests/inference_tests/test_python_api.py @@ -0,0 +1,36 @@ +import os +import unittest + +from tests import get_tests_output_path +from TTS.api import TTS + +OUTPUT_PATH = os.path.join(get_tests_output_path(), "test_python_api.wav") + + +class TTSTest(unittest.TestCase): + def test_single_speaker_model(self): + tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False) + + error_raised = False + try: + tts.tts_to_file(text="Ich bin eine Testnachricht.", speaker="Thorsten", language="de") + except ValueError: + error_raised = True + + tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH) + + self.assertTrue(error_raised) + self.assertFalse(tts.is_multi_speaker) + self.assertFalse(tts.is_multi_lingual) + self.assertIsNone(tts.speakers) + self.assertIsNone(tts.languages) + + def test_multi_speaker_multi_lingual_model(self): + tts = TTS() + tts.load_model_by_name(tts.models[0]) # YourTTS + tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path=OUTPUT_PATH) + + self.assertTrue(tts.is_multi_speaker) + self.assertTrue(tts.is_multi_lingual) + self.assertGreater(len(tts.speakers), 1) + self.assertGreater(len(tts.languages), 1)