From 1ddc484b49b26c1a3108cdbe0ddc826e19202df8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?= <erogol@hotmail.com>
Date: Mon, 12 Dec 2022 12:04:20 +0100
Subject: [PATCH] Python API implementation (#2195)

* Draft implementation

* Fix style

* Add api tests

* Fix lint

* Update docs

* Update tests

* Set env

* Fixup

* Fixup

* Fix lint

* Revert
---
 .github/workflows/aux_tests.yml          |   2 +
 .github/workflows/data_tests.yml         |   2 +
 .github/workflows/inference_tests.yml    |   3 +
 .github/workflows/text_tests.yml         |   2 +
 .github/workflows/tts_tests.yml          |   2 +
 .github/workflows/vocoder_tests.yml      |   2 +
 .github/workflows/zoo_tests0.yml         |   2 +
 .github/workflows/zoo_tests1.yml         |   2 +
 .github/workflows/zoo_tests2.yml         |   2 +
 TTS/api.py                               | 146 +++++++++++++++++++++++
 TTS/utils/manage.py                      |  19 +--
 TTS/utils/synthesizer.py                 |   1 -
 docs/source/inference.md                 |  30 ++++-
 tests/inference_tests/test_python_api.py |  36 ++++++
 14 files changed, 240 insertions(+), 11 deletions(-)
 create mode 100644 TTS/api.py
 create mode 100644 tests/inference_tests/test_python_api.py

diff --git a/.github/workflows/aux_tests.yml b/.github/workflows/aux_tests.yml
index b40a661e..e42b964d 100644
--- a/.github/workflows/aux_tests.yml
+++ b/.github/workflows/aux_tests.yml
@@ -31,6 +31,8 @@ jobs:
           cache-dependency-path: 'requirements*'
       - name: check OS
         run: cat /etc/os-release
+      - name: set ENV
+        run: export TRAINER_TELEMETRY=0
       - name: Install dependencies
         run: |
           sudo apt-get update
diff --git a/.github/workflows/data_tests.yml b/.github/workflows/data_tests.yml
index f49c2e48..9ed1333d 100644
--- a/.github/workflows/data_tests.yml
+++ b/.github/workflows/data_tests.yml
@@ -31,6 +31,8 @@ jobs:
           cache-dependency-path: 'requirements*'
       - name: check OS
         run: cat /etc/os-release
+      - name: set ENV
+        run: export TRAINER_TELEMETRY=0
       - name: Install dependencies
         run: |
           sudo apt-get update
diff --git a/.github/workflows/inference_tests.yml b/.github/workflows/inference_tests.yml
index a57a16df..0ff0857d 100644
--- a/.github/workflows/inference_tests.yml
+++ b/.github/workflows/inference_tests.yml
@@ -31,10 +31,13 @@ jobs:
           cache-dependency-path: 'requirements*'
       - name: check OS
         run: cat /etc/os-release
+      - name: set ENV
+        run: export TRAINER_TELEMETRY=0
       - name: Install dependencies
         run: |
           sudo apt-get update
           sudo apt-get install -y --no-install-recommends git make gcc
+          sudo apt-get install espeak-ng
           make system-deps
       - name: Install/upgrade Python setup deps
         run: python3 -m pip install --upgrade pip setuptools wheel
diff --git a/.github/workflows/text_tests.yml b/.github/workflows/text_tests.yml
index 8a46d051..9ae0a058 100644
--- a/.github/workflows/text_tests.yml
+++ b/.github/workflows/text_tests.yml
@@ -31,6 +31,8 @@ jobs:
           cache-dependency-path: 'requirements*'
       - name: check OS
         run: cat /etc/os-release
+      - name: set ENV
+        run: export TRAINER_TELEMETRY=0
       - name: Install dependencies
         run: |
           sudo apt-get update
diff --git a/.github/workflows/tts_tests.yml b/.github/workflows/tts_tests.yml
index 524bedce..6d35171e 100644
--- a/.github/workflows/tts_tests.yml
+++ b/.github/workflows/tts_tests.yml
@@ -31,6 +31,8 @@ jobs:
           cache-dependency-path: 'requirements*'
       - name: check OS
         run: cat /etc/os-release
+      - name: set ENV
+        run: export TRAINER_TELEMETRY=0
       - name: Install dependencies
         run: |
           sudo apt-get update
diff --git a/.github/workflows/vocoder_tests.yml b/.github/workflows/vocoder_tests.yml
index a8df2e71..cfa8e6af 100644
--- a/.github/workflows/vocoder_tests.yml
+++ b/.github/workflows/vocoder_tests.yml
@@ -31,6 +31,8 @@ jobs:
           cache-dependency-path: 'requirements*'
       - name: check OS
         run: cat /etc/os-release
+      - name: set ENV
+        run: export TRAINER_TELEMETRY=0
       - name: Install dependencies
         run: |
           sudo apt-get update
diff --git a/.github/workflows/zoo_tests0.yml b/.github/workflows/zoo_tests0.yml
index 01e1c400..d5f4cc14 100644
--- a/.github/workflows/zoo_tests0.yml
+++ b/.github/workflows/zoo_tests0.yml
@@ -31,6 +31,8 @@ jobs:
           cache-dependency-path: 'requirements*'
       - name: check OS
         run: cat /etc/os-release
+      - name: set ENV
+        run: export TRAINER_TELEMETRY=0
       - name: Install dependencies
         run: |
           sudo apt-get update
diff --git a/.github/workflows/zoo_tests1.yml b/.github/workflows/zoo_tests1.yml
index 1650aa12..7f15f977 100644
--- a/.github/workflows/zoo_tests1.yml
+++ b/.github/workflows/zoo_tests1.yml
@@ -31,6 +31,8 @@ jobs:
           cache-dependency-path: 'requirements*'
       - name: check OS
         run: cat /etc/os-release
+      - name: set ENV
+        run: export TRAINER_TELEMETRY=0
       - name: Install dependencies
         run: |
           sudo apt-get update
diff --git a/.github/workflows/zoo_tests2.yml b/.github/workflows/zoo_tests2.yml
index f5a155ab..9975a2cf 100644
--- a/.github/workflows/zoo_tests2.yml
+++ b/.github/workflows/zoo_tests2.yml
@@ -31,6 +31,8 @@ jobs:
           cache-dependency-path: 'requirements*'
       - name: check OS
         run: cat /etc/os-release
+      - name: set ENV
+        run: export TRAINER_TELEMETRY=0
       - name: Install dependencies
         run: |
           sudo apt-get update
diff --git a/TTS/api.py b/TTS/api.py
new file mode 100644
index 00000000..99c3e522
--- /dev/null
+++ b/TTS/api.py
@@ -0,0 +1,146 @@
+from pathlib import Path
+
+from TTS.utils.manage import ModelManager
+from TTS.utils.synthesizer import Synthesizer
+
+
+class TTS:
+    """TODO: Add voice conversion and Capacitron support."""
+
+    def __init__(self, model_name: str = None, progress_bar: bool = True, gpu=False):
+        """🐸TTS python interface that allows to load and use the released models.
+
+        Example with a multi-speaker model:
+            >>> from TTS.api import TTS
+            >>> tts = TTS(TTS.list_models()[0])
+            >>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
+            >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
+
+        Example with a single-speaker model:
+            >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
+            >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
+
+        Args:
+            model_name (str, optional): Model name to load. You can list models by `tts.models`. Defaults to None.
+            progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
+            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
+        """
+        self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
+        self.synthesizer = None
+        if model_name:
+            self.load_model_by_name(model_name, gpu)
+
+    @property
+    def models(self):
+        return self.manager.list_tts_models()
+
+    @property
+    def is_multi_speaker(self):
+        if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager:
+            return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
+        return False
+
+    @property
+    def is_multi_lingual(self):
+        if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
+            return self.synthesizer.tts_model.language_manager.num_languages > 1
+        return False
+
+    @property
+    def speakers(self):
+        if not self.is_multi_speaker:
+            return None
+        return self.synthesizer.tts_model.speaker_manager.speaker_names
+
+    @property
+    def languages(self):
+        if not self.is_multi_lingual:
+            return None
+        return self.synthesizer.tts_model.language_manager.language_names
+
+    @staticmethod
+    def get_models_file_path():
+        return Path(__file__).parent / ".models.json"
+
+    @staticmethod
+    def list_models():
+        manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
+        return manager.list_tts_models()
+
+    def download_model_by_name(self, model_name: str):
+        model_path, config_path, model_item = self.manager.download_model(model_name)
+        if model_item["default_vocoder"] is None:
+            return model_path, config_path, None, None
+        vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
+        return model_path, config_path, vocoder_path, vocoder_config_path
+
+    def load_model_by_name(self, model_name: str, gpu: bool = False):
+        model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name)
+        # init synthesizer
+        # None values are fetched from the model
+        self.synthesizer = Synthesizer(
+            tts_checkpoint=model_path,
+            tts_config_path=config_path,
+            tts_speakers_file=None,
+            tts_languages_file=None,
+            vocoder_checkpoint=vocoder_path,
+            vocoder_config=vocoder_config_path,
+            encoder_checkpoint=None,
+            encoder_config=None,
+            use_cuda=gpu,
+        )
+
+    def _check_arguments(self, speaker: str = None, language: str = None):
+        if self.is_multi_speaker and speaker is None:
+            raise ValueError("Model is multi-speaker but no speaker is provided.")
+        if self.is_multi_lingual and language is None:
+            raise ValueError("Model is multi-lingual but no language is provided.")
+        if not self.is_multi_speaker and speaker is not None:
+            raise ValueError("Model is not multi-speaker but speaker is provided.")
+        if not self.is_multi_lingual and language is not None:
+            raise ValueError("Model is not multi-lingual but language is provided.")
+
+    def tts(self, text: str, speaker: str = None, language: str = None):
+        """Convert text to speech.
+
+        Args:
+            text (str):
+                Input text to synthesize.
+            speaker (str, optional):
+                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
+                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
+            language (str, optional):
+                Language code for multi-lingual models. You can check whether loaded model is multi-lingual by
+                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
+        """
+        self._check_arguments(speaker=speaker, language=language)
+
+        wav = self.synthesizer.tts(
+            text=text,
+            speaker_name=speaker,
+            language_name=language,
+            speaker_wav=None,
+            reference_wav=None,
+            style_wav=None,
+            style_text=None,
+            reference_speaker_name=None,
+        )
+        return wav
+
+    def tts_to_file(self, text: str, speaker: str = None, language: str = None, file_path: str = "output.wav"):
+        """Convert text to speech and save the result to a file.
+
+        Args:
+            text (str):
+                Input text to synthesize.
+            speaker (str, optional):
+                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
+                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
+            language (str, optional):
+                Language code for multi-lingual models. You can check whether loaded model is multi-lingual by
+                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
+            file_path (str, optional):
+                Output file path. Defaults to "output.wav".
+        """
+        wav = self.tts(text=text, speaker=speaker, language=language)
+        self.synthesizer.save_wav(wav=wav, path=file_path)
diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py
index 645099e0..babe54f3 100644
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@@ -35,11 +35,13 @@ class ModelManager(object):
         models_file (str): path to .model.json file. Defaults to None.
         output_prefix (str): prefix to `tts` to download models. Defaults to None
         progress_bar (bool): print a progress bar when donwloading a file. Defaults to False.
+        verbose (bool): print info. Defaults to True.
     """
 
-    def __init__(self, models_file=None, output_prefix=None, progress_bar=False):
+    def __init__(self, models_file=None, output_prefix=None, progress_bar=False, verbose=True):
         super().__init__()
         self.progress_bar = progress_bar
+        self.verbose = verbose
         if output_prefix is None:
             self.output_prefix = get_user_data_dir("tts")
         else:
@@ -62,30 +64,31 @@ class ModelManager(object):
             self.models_dict = json.load(json_file)
 
     def _list_models(self, model_type, model_count=0):
+        if self.verbose:
+            print(" Name format: type/language/dataset/model")
         model_list = []
         for lang in self.models_dict[model_type]:
             for dataset in self.models_dict[model_type][lang]:
                 for model in self.models_dict[model_type][lang][dataset]:
                     model_full_name = f"{model_type}--{lang}--{dataset}--{model}"
                     output_path = os.path.join(self.output_prefix, model_full_name)
-                    if os.path.exists(output_path):
-                        print(f" {model_count}: {model_type}/{lang}/{dataset}/{model} [already downloaded]")
-                    else:
-                        print(f" {model_count}: {model_type}/{lang}/{dataset}/{model}")
+                    if self.verbose:
+                        if os.path.exists(output_path):
+                            print(f" {model_count}: {model_type}/{lang}/{dataset}/{model} [already downloaded]")
+                        else:
+                            print(f" {model_count}: {model_type}/{lang}/{dataset}/{model}")
                     model_list.append(f"{model_type}/{lang}/{dataset}/{model}")
                     model_count += 1
         return model_list
 
     def _list_for_model_type(self, model_type):
-        print(" Name format: language/dataset/model")
         models_name_list = []
         model_count = 1
         model_type = "tts_models"
         models_name_list.extend(self._list_models(model_type, model_count))
-        return [name.replace(model_type + "/", "") for name in models_name_list]
+        return models_name_list
 
     def list_models(self):
-        print(" Name format: type/language/dataset/model")
         models_name_list = []
         model_count = 1
         for model_type in self.models_dict:
diff --git a/TTS/utils/synthesizer.py b/TTS/utils/synthesizer.py
index bc3fc0aa..4a0ab038 100644
--- a/TTS/utils/synthesizer.py
+++ b/TTS/utils/synthesizer.py
@@ -62,7 +62,6 @@ class Synthesizer(object):
         self.tts_model = None
         self.vocoder_model = None
         self.speaker_manager = None
-        self.num_speakers = 0
         self.tts_speakers = {}
         self.language_manager = None
         self.num_languages = 0
diff --git a/docs/source/inference.md b/docs/source/inference.md
index 1057d04d..d7d63a69 100644
--- a/docs/source/inference.md
+++ b/docs/source/inference.md
@@ -11,6 +11,7 @@ After the installation, 2 terminal commands are available.
 
 1. TTS Command Line Interface (CLI). - `tts`
 2. Local Demo Server. - `tts-server`
+3. In 🐍Python. - `from TTS.api import TTS`
 
 ## On the Commandline - `tts`
 ![cli.gif](https://github.com/coqui-ai/TTS/raw/main/images/tts_cli.gif)
@@ -99,5 +100,30 @@ tts-server --model_name "<type>/<language>/<dataset>/<model_name>" \
            --vocoder_name "<type>/<language>/<dataset>/<model_name>"
 ```
 
-## TorchHub
-You can also use [this simple colab notebook](https://colab.research.google.com/drive/1iAe7ZdxjUIuN6V4ooaCt0fACEGKEn7HW?usp=sharing) using TorchHub to synthesize speech.
\ No newline at end of file
+## Python API
+
+You can run a multi-speaker and multi-lingual model in Python as
+
+```python
+from TTS.api import TTS
+
+# List available 🐸TTS models and choose the first one
+model_name = TTS.list_models()[0]
+# Init TTS
+tts = TTS(model_name)
+# Run TTS
+# ❗ Since this model is multi-speaker and multi-lingual, we must set the target speaker and the language
+# Text to speech with a numpy output
+wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
+# Text to speech to a file
+tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
+```
+
+Here is an example for a single speaker model.
+
+```python
+# Init TTS with the target model name
+tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
+# Run TTS
+tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
+```
\ No newline at end of file
diff --git a/tests/inference_tests/test_python_api.py b/tests/inference_tests/test_python_api.py
new file mode 100644
index 00000000..fdd7e1cb
--- /dev/null
+++ b/tests/inference_tests/test_python_api.py
@@ -0,0 +1,36 @@
+import os
+import unittest
+
+from tests import get_tests_output_path
+from TTS.api import TTS
+
+OUTPUT_PATH = os.path.join(get_tests_output_path(), "test_python_api.wav")
+
+
+class TTSTest(unittest.TestCase):
+    def test_single_speaker_model(self):
+        tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
+
+        error_raised = False
+        try:
+            tts.tts_to_file(text="Ich bin eine Testnachricht.", speaker="Thorsten", language="de")
+        except ValueError:
+            error_raised = True
+
+        tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path=OUTPUT_PATH)
+
+        self.assertTrue(error_raised)
+        self.assertFalse(tts.is_multi_speaker)
+        self.assertFalse(tts.is_multi_lingual)
+        self.assertIsNone(tts.speakers)
+        self.assertIsNone(tts.languages)
+
+    def test_multi_speaker_multi_lingual_model(self):
+        tts = TTS()
+        tts.load_model_by_name(tts.models[0])  # YourTTS
+        tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path=OUTPUT_PATH)
+
+        self.assertTrue(tts.is_multi_speaker)
+        self.assertTrue(tts.is_multi_lingual)
+        self.assertGreater(len(tts.speakers), 1)
+        self.assertGreater(len(tts.languages), 1)