Merge pull request #2515 from coqui-ai/tts_cmd

🐸Studio models by `tts`
2023-04-13 19:34:28 +02:00 · 2023-04-13 19:34:28 +02:00 · e07c6f54fd
parent e33e7170ed c9375e4b8b
commit e07c6f54fd
4 changed files with 57 additions and 13 deletions
--- a/TTS/api.py
+++ b/TTS/api.py
@ -4,7 +4,7 @@ import os
 import tempfile
 import urllib.request
 from pathlib import Path
-from typing import Tuple
+from typing import Tuple, Union
 import numpy as np
 import requests
@ -86,7 +86,6 @@ class CS_API:
        return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
    def _check_token(self):
        self.ping_api()
        if self.api_token is None:
            self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
            self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
@ -183,6 +182,7 @@ class CS_API:
            language (str): Language of the text. If None, the default language of the speaker is used.
        """
        self._check_token()
        self.ping_api()
        if speaker_name is None and speaker_id is None:
            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
        if speaker_id is None:
@ -457,7 +457,7 @@ class TTS:
        emotion: str = "Neutral",
        speed: float = 1.0,
        file_path: str = None,
-    ):
+    ) -> Union[np.ndarray, str]:
        """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
        Args:
@ -473,9 +473,12 @@ class TTS:
                Speed of the speech. Defaults to 1.0.
            file_path (str, optional):
                Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
        Returns:
            Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file.
        """
        speaker_name = self.model_name.split("/")[2]
-        if file_path is None:
+        if file_path is not None:
            return self.csapi.tts_to_file(
                text=text,
                speaker_name=speaker_name,
--- a/TTS/bin/synthesize.py
+++ b/TTS/bin/synthesize.py
@ -8,6 +8,7 @@ from argparse import RawTextHelpFormatter
 # pylint: disable=redefined-outer-name, unused-argument
 from pathlib import Path
 from TTS.api import TTS
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
@ -183,6 +184,14 @@ If you don't specify any models, then it uses LJSpeech based English model.
    )
    parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
    # args for coqui studio
    parser.add_argument(
        "--emotion",
        type=str,
        help="Emotion to condition the model with. Only available for 🐸Coqui Studio models.",
        default="Neutral",
    )
    # args for multi-speaker synthesis
    parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
    parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
@ -285,6 +294,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
    # load model manager
    path = Path(__file__).parent / "../.models.json"
    manager = ModelManager(path, progress_bar=args.progress_bar)
    api = TTS()
    tts_path = None
    tts_config_path = None
@ -299,6 +309,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
    # CASE1 #list : list pre-trained TTS models
    if args.list_models:
        manager.add_cs_api_models(api.list_models())
        manager.list_models()
        sys.exit()
@ -313,7 +324,15 @@ If you don't specify any models, then it uses LJSpeech based English model.
        manager.model_info_by_full_name(model_query_full_name)
        sys.exit()
-    # CASE3: load pre-trained model paths
+    # CASE3: TTS with coqui studio models
    if "coqui_studio" in args.model_name:
        print(" > Using 🐸Coqui Studio model: ", args.model_name)
        api = TTS(model_name=args.model_name)
        api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path)
        print(" > Saving output to ", args.out_path)
        return
    # CASE4: load pre-trained model paths
    if args.model_name is not None and not args.model_path:
        model_path, config_path, model_item = manager.download_model(args.model_name)
@ -333,7 +352,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
    if args.vocoder_name is not None and not args.vocoder_path:
        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
-    # CASE4: set custom model paths
+    # CASE5: set custom model paths
    if args.model_path is not None:
        tts_path = args.model_path
        tts_config_path = args.config_path
--- a/TTS/utils/manage.py
+++ b/TTS/utils/manage.py
@ -3,7 +3,7 @@ import os
 import zipfile
 from pathlib import Path
 from shutil import copyfile, rmtree
-from typing import Dict, Tuple
+from typing import Dict, List, Tuple
 import requests
 from tqdm import tqdm
@ -63,6 +63,28 @@ class ModelManager(object):
        with open(file_path, "r", encoding="utf-8") as json_file:
            self.models_dict = json.load(json_file)
    def add_cs_api_models(self, model_list: List[str]):
        """Register Coqui Studio model names returned by the API in `self.models_dict`.

        Each name is expected to follow the format
        `<coqui_studio_model>/en/<speaker_name>/<coqui_studio_model>`, i.e. four
        "/"-separated fields mapping to type/language/dataset/model. Names that do
        not contain "coqui_studio" are ignored. Entries that already exist are left
        untouched; only the missing nesting levels are created as empty dicts.

        Args:
            model_list (List[str]): Model names to add.
        """
        for full_name in model_list:
            if "coqui_studio" not in full_name:
                continue
            # Unpacking raises ValueError unless the name has exactly four fields.
            model_type, lang, dataset, model = full_name.split("/")
            # Walk down type -> language -> dataset -> model, creating missing levels.
            node = self.models_dict
            for key in (model_type, lang, dataset, model):
                if key not in node:
                    node[key] = {}
                node = node[key]
    def _list_models(self, model_type, model_count=0):
        if self.verbose:
            print(" Name format: type/language/dataset/model")
--- a/tests/inference_tests/test_synthesize.py
+++ b/tests/inference_tests/test_synthesize.py
@ -19,9 +19,9 @@ def test_synthesize():
        f'--text "This is an example." --out_path "{output_path}"'
    )
-    # multi-speaker SC-Glow model
+    # 🐸 Coqui studio model
-    # run_cli("tts --model_name tts_models/en/vctk/sc-glow-tts --list_speaker_idxs")
+    run_cli(
-    # run_cli(
+        'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" '
-    #     f'tts --model_name tts_models/en/vctk/sc-glow-tts --speaker_idx "p304" '
+        '--text "This is it" '
-    #     f'--text "This is an example." --out_path "{output_path}"'
+        f'--out_path "{output_path}"'
-    # )
+    )