Merge pull request #2515 from coqui-ai/tts_cmd

🐸Studio models by `tts`
This commit is contained in:
Eren Gölge 2023-04-13 19:34:28 +02:00 committed by GitHub
commit e07c6f54fd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 57 additions and 13 deletions

View File

@ -4,7 +4,7 @@ import os
import tempfile import tempfile
import urllib.request import urllib.request
from pathlib import Path from pathlib import Path
from typing import Tuple from typing import Tuple, Union
import numpy as np import numpy as np
import requests import requests
@ -86,7 +86,6 @@ class CS_API:
return ["Neutral", "Happy", "Sad", "Angry", "Dull"] return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
def _check_token(self): def _check_token(self):
self.ping_api()
if self.api_token is None: if self.api_token is None:
self.api_token = os.environ.get("COQUI_STUDIO_TOKEN") self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"} self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
@ -183,6 +182,7 @@ class CS_API:
language (str): Language of the text. If None, the default language of the speaker is used. language (str): Language of the text. If None, the default language of the speaker is used.
""" """
self._check_token() self._check_token()
self.ping_api()
if speaker_name is None and speaker_id is None: if speaker_name is None and speaker_id is None:
raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.") raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
if speaker_id is None: if speaker_id is None:
@ -457,7 +457,7 @@ class TTS:
emotion: str = "Neutral", emotion: str = "Neutral",
speed: float = 1.0, speed: float = 1.0,
file_path: str = None, file_path: str = None,
): ) -> Union[np.ndarray, str]:
"""Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API. """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
Args: Args:
@ -473,9 +473,12 @@ class TTS:
Speed of the speech. Defaults to 1.0. Speed of the speech. Defaults to 1.0.
file_path (str, optional): file_path (str, optional):
Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None. Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
Returns:
Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file.
""" """
speaker_name = self.model_name.split("/")[2] speaker_name = self.model_name.split("/")[2]
if file_path is None: if file_path is not None:
return self.csapi.tts_to_file( return self.csapi.tts_to_file(
text=text, text=text,
speaker_name=speaker_name, speaker_name=speaker_name,

View File

@ -8,6 +8,7 @@ from argparse import RawTextHelpFormatter
# pylint: disable=redefined-outer-name, unused-argument # pylint: disable=redefined-outer-name, unused-argument
from pathlib import Path from pathlib import Path
from TTS.api import TTS
from TTS.utils.manage import ModelManager from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer from TTS.utils.synthesizer import Synthesizer
@ -183,6 +184,14 @@ If you don't specify any models, then it uses LJSpeech based English model.
) )
parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None) parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
# args for coqui studio
parser.add_argument(
"--emotion",
type=str,
help="Emotion to condition the model with. Only available for 🐸Coqui Studio models.",
default="Neutral",
)
# args for multi-speaker synthesis # args for multi-speaker synthesis
parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None) parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None) parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
@ -285,6 +294,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
# load model manager # load model manager
path = Path(__file__).parent / "../.models.json" path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path, progress_bar=args.progress_bar) manager = ModelManager(path, progress_bar=args.progress_bar)
api = TTS()
tts_path = None tts_path = None
tts_config_path = None tts_config_path = None
@ -299,6 +309,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
# CASE1 #list : list pre-trained TTS models # CASE1 #list : list pre-trained TTS models
if args.list_models: if args.list_models:
manager.add_cs_api_models(api.list_models())
manager.list_models() manager.list_models()
sys.exit() sys.exit()
@ -313,7 +324,15 @@ If you don't specify any models, then it uses LJSpeech based English model.
manager.model_info_by_full_name(model_query_full_name) manager.model_info_by_full_name(model_query_full_name)
sys.exit() sys.exit()
# CASE3: load pre-trained model paths # CASE3: TTS with coqui studio models
if "coqui_studio" in args.model_name:
print(" > Using 🐸Coqui Studio model: ", args.model_name)
api = TTS(model_name=args.model_name)
api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path)
print(" > Saving output to ", args.out_path)
return
# CASE4: load pre-trained model paths
if args.model_name is not None and not args.model_path: if args.model_name is not None and not args.model_path:
model_path, config_path, model_item = manager.download_model(args.model_name) model_path, config_path, model_item = manager.download_model(args.model_name)
@ -333,7 +352,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
if args.vocoder_name is not None and not args.vocoder_path: if args.vocoder_name is not None and not args.vocoder_path:
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
# CASE4: set custom model paths # CASE5: set custom model paths
if args.model_path is not None: if args.model_path is not None:
tts_path = args.model_path tts_path = args.model_path
tts_config_path = args.config_path tts_config_path = args.config_path

View File

@ -3,7 +3,7 @@ import os
import zipfile import zipfile
from pathlib import Path from pathlib import Path
from shutil import copyfile, rmtree from shutil import copyfile, rmtree
from typing import Dict, Tuple from typing import Dict, List, Tuple
import requests import requests
from tqdm import tqdm from tqdm import tqdm
@ -63,6 +63,28 @@ class ModelManager(object):
with open(file_path, "r", encoding="utf-8") as json_file: with open(file_path, "r", encoding="utf-8") as json_file:
self.models_dict = json.load(json_file) self.models_dict = json.load(json_file)
def add_cs_api_models(self, model_list: List[str]):
    """Register Coqui Studio model names in ``self.models_dict``.

    Each name is expected to have the following format
    `<coqui_studio_model>/en/<speaker_name>/<coqui_studio_model>`, i.e.
    `type/language/dataset/model`. For every such name a nested
    ``models_dict[type][language][dataset][model]`` entry is created as an
    empty dict when it does not exist yet; existing entries are left untouched.

    Names that do not contain "coqui_studio" are ignored. Names that contain
    it but do not split into exactly four "/"-separated parts are skipped
    with a warning instead of raising, so a single malformed name does not
    abort registration of the remaining ones.

    Args:
        model_list (List[str]): Model names that are returned from the API.
    """
    for model_name in model_list:
        # Only Coqui Studio names are handled here.
        if "coqui_studio" not in model_name:
            continue
        parts = model_name.split("/")
        if len(parts) != 4:
            print(f" [!] Skipping Coqui Studio model with unexpected name format: {model_name}")
            continue
        model_type, lang, dataset, model = parts
        # `setdefault` creates each missing level and keeps whatever is already registered.
        (
            self.models_dict.setdefault(model_type, {})
            .setdefault(lang, {})
            .setdefault(dataset, {})
            .setdefault(model, {})
        )
def _list_models(self, model_type, model_count=0): def _list_models(self, model_type, model_count=0):
if self.verbose: if self.verbose:
print(" Name format: type/language/dataset/model") print(" Name format: type/language/dataset/model")

View File

@ -19,9 +19,9 @@ def test_synthesize():
f'--text "This is an example." --out_path "{output_path}"' f'--text "This is an example." --out_path "{output_path}"'
) )
# multi-speaker SC-Glow model # 🐸 Coqui studio model
# run_cli("tts --model_name tts_models/en/vctk/sc-glow-tts --list_speaker_idxs") run_cli(
# run_cli( 'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" '
# f'tts --model_name tts_models/en/vctk/sc-glow-tts --speaker_idx "p304" ' '--text "This is it" '
# f'--text "This is an example." --out_path "{output_path}"' f'--out_path "{output_path}"'
# ) )