Merge pull request #2519 from coqui-ai/dev

🌈 v0.13.2
Eren Gölge 2023-04-14 10:47:23 +02:00 committed by GitHub
commit b3b4034c9d
6 changed files with 92 additions and 18 deletions

TTS/VERSION

@@ -1 +1 @@
-0.13.1
+0.13.2

TTS/api.py

@@ -4,7 +4,7 @@ import os
 import tempfile
 import urllib.request
 from pathlib import Path
-from typing import Tuple
+from typing import Tuple, Union

 import numpy as np
 import requests
@@ -86,7 +86,6 @@ class CS_API:
         return ["Neutral", "Happy", "Sad", "Angry", "Dull"]

     def _check_token(self):
-        self.ping_api()
         if self.api_token is None:
             self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
             self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
@@ -183,6 +182,7 @@ class CS_API:
             language (str): Language of the text. If None, the default language of the speaker is used.
         """
         self._check_token()
+        self.ping_api()
         if speaker_name is None and speaker_id is None:
             raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
         if speaker_id is None:
@@ -457,7 +457,7 @@ class TTS:
         emotion: str = "Neutral",
         speed: float = 1.0,
         file_path: str = None,
-    ):
+    ) -> Union[np.ndarray, str]:
         """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.

         Args:
@@ -473,9 +473,12 @@ class TTS:
                 Speed of the speech. Defaults to 1.0.
             file_path (str, optional):
                 Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
+
+        Returns:
+            Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file.
         """
         speaker_name = self.model_name.split("/")[2]
-        if file_path is None:
+        if file_path is not None:
             return self.csapi.tts_to_file(
                 text=text,
                 speaker_name=speaker_name,
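
Net effect of the api.py hunks: `ping_api()` moves out of `_check_token()` into the synthesis call itself, and the studio synthesis method now advertises its dual return type. A minimal sketch of the new contract, assuming the annotated method is `TTS.tts_coqui_studio` (only its signature is shown in the diff); the model name is the one used in the test at the bottom of this commit, and "Happy" is one of the emotions listed above:

from TTS.api import TTS

api = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio")

# file_path given -> forwarded to CS_API.tts_to_file; the output path (str) comes back
out = api.tts_coqui_studio(text="This is it", emotion="Happy", file_path="out.wav")

# file_path omitted -> the raw waveform (np.ndarray) comes back instead
wav = api.tts_coqui_studio(text="This is it", emotion="Happy")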

TTS/bin/remove_silence_using_vad.py

@@ -1,12 +1,16 @@
 import argparse
 import glob
+import multiprocessing
 import os
 import pathlib

+import torch
 from tqdm import tqdm

 from TTS.utils.vad import get_vad_model_and_utils, remove_silence

+torch.set_num_threads(1)


 def adjust_path_and_remove_silence(audio_path):
     output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
@@ -44,10 +48,24 @@ def preprocess_audios():
         # create threads
         # num_threads = multiprocessing.cpu_count()
         # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
-        for f in tqdm(files):
-            output_path, is_speech = adjust_path_and_remove_silence(f)
-            if not is_speech:
-                filtered_files.append(output_path)
+        if args.num_processes > 1:
+            with multiprocessing.Pool(processes=args.num_processes) as pool:
+                results = list(
+                    tqdm(
+                        pool.imap_unordered(adjust_path_and_remove_silence, files),
+                        total=len(files),
+                        desc="Processing audio files",
+                    )
+                )
+            for output_path, is_speech in results:
+                if not is_speech:
+                    filtered_files.append(output_path)
+        else:
+            for f in tqdm(files):
+                output_path, is_speech = adjust_path_and_remove_silence(f)
+                if not is_speech:
+                    filtered_files.append(output_path)

     # write files that do not have speech
     with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
@@ -87,6 +105,18 @@ if __name__ == "__main__":
         default=False,
         help="If True use cuda",
     )
+    parser.add_argument(
+        "--use_onnx",
+        type=bool,
+        default=False,
+        help="If True use onnx",
+    )
+    parser.add_argument(
+        "--num_processes",
+        type=int,
+        default=1,
+        help="Number of processes to use",
+    )
     args = parser.parse_args()

     # load the model and utils
     model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda)
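
With the new flags, the silence trimming in preprocess_audios() can fan out over a multiprocessing.Pool, while torch.set_num_threads(1) keeps each worker single-threaded (presumably to avoid oversubscribing cores). A hypothetical invocation; the --input_dir/--output_dir flag names are assumed from the args.input_dir/args.output_dir references above, and the directory paths are illustrative:

python TTS/bin/remove_silence_using_vad.py --input_dir ./wavs --output_dir ./wavs_trimmed --num_processes 4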

TTS/bin/synthesize.py

@@ -8,6 +8,7 @@ from argparse import RawTextHelpFormatter
 # pylint: disable=redefined-outer-name, unused-argument
 from pathlib import Path

+from TTS.api import TTS
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
@@ -183,6 +184,14 @@ If you don't specify any models, then it uses LJSpeech based English model.
     )
     parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)

+    # args for coqui studio
+    parser.add_argument(
+        "--emotion",
+        type=str,
+        help="Emotion to condition the model with. Only available for 🐸Coqui Studio models.",
+        default="Neutral",
+    )
+
     # args for multi-speaker synthesis
     parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
     parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
@@ -285,6 +294,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
     # load model manager
     path = Path(__file__).parent / "../.models.json"
     manager = ModelManager(path, progress_bar=args.progress_bar)
+    api = TTS()

     tts_path = None
     tts_config_path = None
@@ -299,6 +309,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
     # CASE1 #list : list pre-trained TTS models
     if args.list_models:
+        manager.add_cs_api_models(api.list_models())
         manager.list_models()
         sys.exit()
@@ -313,7 +324,15 @@ If you don't specify any models, then it uses LJSpeech based English model.
         manager.model_info_by_full_name(model_query_full_name)
         sys.exit()

-    # CASE3: load pre-trained model paths
+    # CASE3: TTS with coqui studio models
+    if "coqui_studio" in args.model_name:
+        print(" > Using 🐸Coqui Studio model: ", args.model_name)
+        api = TTS(model_name=args.model_name)
+        api.tts_to_file(text=args.text, emotion=args.emotion, file_path=args.out_path)
+        print(" > Saving output to ", args.out_path)
+        return
+
+    # CASE4: load pre-trained model paths
     if args.model_name is not None and not args.model_path:
         model_path, config_path, model_item = manager.download_model(args.model_name)
@@ -333,7 +352,7 @@ If you don't specify any models, then it uses LJSpeech based English model.
     if args.vocoder_name is not None and not args.vocoder_path:
         vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

-    # CASE4: set custom model paths
+    # CASE5: set custom model paths
     if args.model_path is not None:
         tts_path = args.model_path
         tts_config_path = args.config_path
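
The new CASE3 branch makes 🐸Coqui Studio models usable straight from the tts CLI, short-circuiting before any local model is resolved. An example invocation mirroring the test added at the end of this commit, with the new --emotion flag added ("Happy" is an illustrative value):

tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" --text "This is it" --emotion "Happy" --out_path out.wav

Note this path needs the COQUI_STUDIO_TOKEN environment variable set, since CS_API._check_token falls back to it when no api_token is given.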

TTS/utils/manage.py

@@ -3,7 +3,7 @@ import os
 import zipfile
 from pathlib import Path
 from shutil import copyfile, rmtree
-from typing import Dict, Tuple
+from typing import Dict, List, Tuple

 import requests
 from tqdm import tqdm
@@ -63,6 +63,28 @@ class ModelManager(object):
         with open(file_path, "r", encoding="utf-8") as json_file:
             self.models_dict = json.load(json_file)

+    def add_cs_api_models(self, model_list: List[str]):
+        """Add list of Coqui Studio model names that are returned from the api
+
+        Each has the following format `<coqui_studio_model>/en/<speaker_name>/<coqui_studio_model>`
+        """
+
+        def _add_model(model_name: str):
+            if not "coqui_studio" in model_name:
+                return
+            model_type, lang, dataset, model = model_name.split("/")
+            if model_type not in self.models_dict:
+                self.models_dict[model_type] = {}
+            if lang not in self.models_dict[model_type]:
+                self.models_dict[model_type][lang] = {}
+            if dataset not in self.models_dict[model_type][lang]:
+                self.models_dict[model_type][lang][dataset] = {}
+            if model not in self.models_dict[model_type][lang][dataset]:
+                self.models_dict[model_type][lang][dataset][model] = {}
+
+        for model_name in model_list:
+            _add_model(model_name)
+
     def _list_models(self, model_type, model_count=0):
         if self.verbose:
             print(" Name format: type/language/dataset/model")

tests/inference_tests/test_synthesize.py

@@ -19,9 +19,9 @@ def test_synthesize():
         f'--text "This is an example." --out_path "{output_path}"'
     )

     # multi-speaker SC-Glow model
     # run_cli("tts --model_name tts_models/en/vctk/sc-glow-tts --list_speaker_idxs")
     # run_cli(
     #     f'tts --model_name tts_models/en/vctk/sc-glow-tts --speaker_idx "p304" '
     #     f'--text "This is an example." --out_path "{output_path}"'
     # )

+    # 🐸 Coqui studio model
+    run_cli(
+        'tts --model_name "coqui_studio/en/Torcull Diarmuid/coqui_studio" '
+        '--text "This is it" '
+        f'--out_path "{output_path}"'
+    )