From 758ef84cc22cf24094853bc797b3ceddb0c48cba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 13 Apr 2023 14:14:41 +0200 Subject: [PATCH] =?UTF-8?q?Using=20=F0=9F=90=B8Studio=20models=20with=20`t?= =?UTF-8?q?ts`=20command?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TTS/api.py | 11 +++++++---- TTS/bin/synthesize.py | 24 ++++++++++++++++++++++-- TTS/utils/manage.py | 24 +++++++++++++++++++++++- tests/inference_tests/test_synthesize.py | 12 ++++++------ 4 files changed, 58 insertions(+), 13 deletions(-) diff --git a/TTS/api.py b/TTS/api.py index b0628743..4e0731de 100644 --- a/TTS/api.py +++ b/TTS/api.py @@ -4,7 +4,7 @@ import os import tempfile import urllib.request from pathlib import Path -from typing import Tuple +from typing import Tuple, Union import numpy as np import requests @@ -86,7 +86,6 @@ class CS_API: return ["Neutral", "Happy", "Sad", "Angry", "Dull"] def _check_token(self): - self.ping_api() if self.api_token is None: self.api_token = os.environ.get("COQUI_STUDIO_TOKEN") self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"} @@ -183,6 +182,7 @@ class CS_API: language (str): Language of the text. If None, the default language of the speaker is used. """ self._check_token() + self.ping_api() if speaker_name is None and speaker_id is None: raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.") if speaker_id is None: @@ -457,7 +457,7 @@ class TTS: emotion: str = "Neutral", speed: float = 1.0, file_path: str = None, - ): + ) -> Union[np.ndarray, str]: """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API. Args: @@ -473,9 +473,12 @@ class TTS: Speed of the speech. Defaults to 1.0. file_path (str, optional): Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None. 
+ + Returns: + Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file. """ speaker_name = self.model_name.split("/")[2] - if file_path is None: + if file_path is not None: return self.csapi.tts_to_file( text=text, speaker_name=speaker_name, diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index 2877ea2b..fa49eeef 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -7,7 +7,9 @@ from argparse import RawTextHelpFormatter # pylint: disable=redefined-outer-name, unused-argument from pathlib import Path +from pprint import pprint +from TTS.api import TTS from TTS.utils.manage import ModelManager from TTS.utils.synthesizer import Synthesizer @@ -183,6 +185,14 @@ If you don't specify any models, then it uses LJSpeech based English model. ) parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None) + # args for coqui studio + parser.add_argument( + "--emotion", + type=str, + help="Emotion to condition the model with. Only available for 🐸Coqui Studio models.", + default="Neutral", + ) + # args for multi-speaker synthesis parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None) @@ -285,6 +295,7 @@ If you don't specify any models, then it uses LJSpeech based English model. # load model manager path = Path(__file__).parent / "../.models.json" manager = ModelManager(path, progress_bar=args.progress_bar) + api = TTS() tts_path = None tts_config_path = None @@ -299,6 +310,7 @@ If you don't specify any models, then it uses LJSpeech based English model. # CASE1 #list : list pre-trained TTS models if args.list_models: + manager.add_cs_api_models(api.list_models()) manager.list_models() sys.exit() @@ -313,7 +325,15 @@ If you don't specify any models, then it uses LJSpeech based English model. 
 manager.model_info_by_full_name(model_query_full_name) sys.exit() - # CASE3: load pre-trained model paths + # CASE3: TTS with coqui studio models + if "coqui_studio" in args.model_name: + print(" > Using 🐸Coqui Studio model: ", args.model_name) + api = TTS(model_name=args.model_name) + api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path) + print(" > Saving output to ", args.out_path) + return + + # CASE4: load pre-trained model paths if args.model_name is not None and not args.model_path: model_path, config_path, model_item = manager.download_model(args.model_name) @@ -333,7 +353,7 @@ If you don't specify any models, then it uses LJSpeech based English model. if args.vocoder_name is not None and not args.vocoder_path: vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) - # CASE4: set custom model paths + # CASE5: set custom model paths if args.model_path is not None: tts_path = args.model_path tts_config_path = args.config_path diff --git a/TTS/utils/manage.py b/TTS/utils/manage.py index 8419429d..8bf13bcc 100644 --- a/TTS/utils/manage.py +++ b/TTS/utils/manage.py @@ -3,7 +3,7 @@ import os import zipfile from pathlib import Path from shutil import copyfile, rmtree -from typing import Dict, Tuple +from typing import Dict, List, Tuple import requests from tqdm import tqdm @@ -63,6 +63,28 @@ class ModelManager(object): with open(file_path, "r", encoding="utf-8") as json_file: self.models_dict = json.load(json_file) + def add_cs_api_models(self, model_list: List[str]): + """Add list of Coqui Studio model names that are returned from the api + + Each has the following format `<model_type>/en/<dataset>/<model>` + """ + + def _add_model(model_name: str): + if not "coqui_studio" in model_name: + return + model_type, lang, dataset, model = model_name.split("/") + if model_type not in self.models_dict: + self.models_dict[model_type] = {} + if lang not in self.models_dict[model_type]: + self.models_dict[model_type][lang] = {} + if dataset not in 
self.models_dict[model_type][lang]: + self.models_dict[model_type][lang][dataset] = {} + if model not in self.models_dict[model_type][lang][dataset]: + self.models_dict[model_type][lang][dataset][model] = {} + + for model_name in model_list: + _add_model(model_name) + def _list_models(self, model_type, model_count=0): if self.verbose: print(" Name format: type/language/dataset/model") diff --git a/tests/inference_tests/test_synthesize.py b/tests/inference_tests/test_synthesize.py index 42b77172..4bf751a5 100644 --- a/tests/inference_tests/test_synthesize.py +++ b/tests/inference_tests/test_synthesize.py @@ -19,9 +19,9 @@ def test_synthesize(): f'--text "This is an example." --out_path "{output_path}"' ) - # multi-speaker SC-Glow model - # run_cli("tts --model_name tts_models/en/vctk/sc-glow-tts --list_speaker_idxs") - # run_cli( - # f'tts --model_name tts_models/en/vctk/sc-glow-tts --speaker_idx "p304" ' - # f'--text "This is an example." --out_path "{output_path}"' - # ) + # 🐸 Coqui studio model + run_cli( + 'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" ' + '--text "This is it" ' + f'--out_path "{output_path}"' + )