🐸 Coqui Studio API integration (#2484)

* Warn when lang is not avail

* Make style

* Implement Coqui Studio API

* Test

* Update docs

* Set action

* Make style

* Make lint

* Update README

* Make style

* Fix action

* Run actions
Eren Gölge, 2023-04-05 15:06:50 +02:00, committed by GitHub
parent ce79160576
commit ad8b9bf2be
6 changed files with 482 additions and 31 deletions

View File (GitHub Actions inference tests workflow)

@@ -32,7 +32,8 @@ jobs:
       - name: check OS
        run: cat /etc/os-release
      - name: set ENV
-       run: export TRAINER_TELEMETRY=0
+       run: |
+         export TRAINER_TELEMETRY=0
      - name: Install dependencies
        run: |
          sudo apt-get update
@@ -49,4 +50,6 @@ jobs:
          python3 -m pip install .[all]
          python3 setup.py egg_info
      - name: Unit tests
-       run: make inference_tests
+       run: |
+         export COQUI_STUDIO_TOKEN=${{ secrets.COQUI_STUDIO_TOKEN }}
+         make inference_tests
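For reference, the `COQUI_STUDIO_TOKEN` the workflow exports is the same token the new `CS_API` wrapper (added in `TTS/api.py` below) reads at start-up. A minimal sketch of the two ways to supply it, with the token value as a placeholder:

```python
import os

from TTS.api import CS_API

# Option 1: let CS_API pick the token up from the environment, as the CI step does.
os.environ["COQUI_STUDIO_TOKEN"] = "<your-token>"  # placeholder value
api = CS_API()

# Option 2: pass the token explicitly.
api = CS_API(api_token="<your-token>")
```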

View File (README.md)

@@ -197,6 +197,36 @@ tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_
 tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="output.wav")
 tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr-fr", file_path="output.wav")
 tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt-br", file_path="output.wav")
+
+# Example voice conversion converting the speaker of `source_wav` to the speaker of `target_wav`
+tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=True)
+tts.voice_conversion_to_file(source_wav="my/source.wav", target_wav="my/target.wav", file_path="output.wav")
+
+# Example voice cloning by combining a single-speaker TTS model with the voice conversion model. This way, you can
+# clone voices by using any model in 🐸TTS.
+tts = TTS("tts_models/de/thorsten/tacotron2-DDC")
+tts.tts_with_vc_to_file(
+    "Wie sage ich auf Italienisch, dass ich dich liebe?",
+    speaker_wav="target/speaker.wav",
+    file_path="output.wav"
+)
+
+# Example text to speech using [🐸Coqui Studio](https://coqui.ai) models. You can use all of your available speakers in the studio.
+# A [🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
+# Set the `COQUI_STUDIO_TOKEN` environment variable to make the token available to the API.
+# If you have a valid API token set, you will see the studio speakers as separate models in the list.
+# The name format is coqui_studio/en/<studio_speaker_name>/coqui_studio
+models = TTS().list_models()
+# Init TTS with the target studio speaker
+tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False, gpu=False)
+# Run TTS
+tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
+# Run TTS with emotion and speed control
+tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
 ```

 ### Command line `tts`

View File (TTS/api.py)

@@ -1,11 +1,227 @@
+import http.client
+import json
+import os
 import tempfile
+import urllib.request
 from pathlib import Path
+from typing import Tuple
+
+import numpy as np
+from scipy.io import wavfile
 
 from TTS.utils.audio.numpy_transforms import save_wav
 from TTS.utils.manage import ModelManager
 from TTS.utils.synthesizer import Synthesizer
 
+
+class Speaker(object):
+    """Convert dict to object."""
+
+    def __init__(self, d, is_voice=False):
+        self.is_voice = is_voice
+        for k, v in d.items():
+            if isinstance(k, (list, tuple)):
+                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
+            else:
+                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)
+
+    def __repr__(self):
+        return str(self.__dict__)
+
+
+class CS_API:
+    """🐸Coqui Studio API Wrapper.
+
+    🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
+    interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different
+    characteristics. You can use these voices to generate new audio files or use them in your applications.
+
+    You can use all the built-in and your own 🐸Coqui Studio speakers through this API with an API token.
+    You can sign up to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from
+    https://app.coqui.ai/account. You can either set the token as an environment variable with
+    `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<token>)`.
+
+    Visit https://app.coqui.ai/api for more information.
+
+    Example listing all available speakers:
+        >>> from TTS.api import CS_API
+        >>> tts = CS_API()
+        >>> tts.speakers
+
+    Example listing all emotions:
+        >>> from TTS.api import CS_API
+        >>> tts = CS_API()
+        >>> tts.emotions
+
+    Example with a built-in 🐸 speaker:
+        >>> from TTS.api import CS_API
+        >>> tts = CS_API()
+        >>> wav, sr = tts.tts("Hello world", speaker_name="Claribel Dervla")
+        >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")
+    """
+
+    def __init__(self, api_token=None):
+        self.api_token = api_token
+        self.api_prefix = "/api/v2"
+        self.headers = None
+        self._speakers = None
+        self._check_token()
+
+    @property
+    def speakers(self):
+        if self._speakers is None:
+            self._speakers = self.list_all_speakers()
+        return self._speakers
+
+    @property
+    def emotions(self):
+        """Return a list of available emotions.
+
+        TODO: Get this from the API endpoint.
+        """
+        return ["Neutral", "Happy", "Sad", "Angry", "Dull"]
+
+    def _check_token(self):
+        if self.api_token is None:
+            self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
+        self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
+        if not self.api_token:
+            raise ValueError(
+                "No API token found for 🐸Coqui Studio voices - https://coqui.ai.\n"
+                "Visit 🔗https://app.coqui.ai/account to get one.\n"
+                "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
+            )
+
+    def list_all_speakers(self):
+        """Return both built-in Coqui Studio speakers and custom voices created by the user."""
+        return self.list_speakers() + self.list_voices()
+
+    def list_speakers(self):
+        """List built-in Coqui Studio speakers."""
+        self._check_token()
+        conn = http.client.HTTPSConnection("app.coqui.ai")
+        conn.request("GET", f"{self.api_prefix}/speakers", headers=self.headers)
+        res = conn.getresponse()
+        data = res.read()
+        return [Speaker(s) for s in json.loads(data)["result"]]
+
+    def list_voices(self):
+        """List custom voices created by the user."""
+        conn = http.client.HTTPSConnection("app.coqui.ai")
+        conn.request("GET", f"{self.api_prefix}/voices", headers=self.headers)
+        res = conn.getresponse()
+        data = res.read()
+        return [Speaker(s, True) for s in json.loads(data)["result"]]
+
+    def list_speakers_as_tts_models(self):
+        """List speakers in ModelManager format."""
+        models = []
+        for speaker in self.speakers:
+            model = f"coqui_studio/en/{speaker.name}/coqui_studio"
+            models.append(model)
+        return models
+
+    def name_to_speaker(self, name):
+        for speaker in self.speakers:
+            if speaker.name == name:
+                return speaker
+        raise ValueError(f"Speaker {name} not found.")
+
+    def id_to_speaker(self, speaker_id):
+        for speaker in self.speakers:
+            if speaker.id == speaker_id:
+                return speaker
+        raise ValueError(f"Speaker {speaker_id} not found.")
+
+    @staticmethod
+    def url_to_np(url):
+        tmp_file, _ = urllib.request.urlretrieve(url)
+        rate, data = wavfile.read(tmp_file)
+        return data, rate
+
+    @staticmethod
+    def _create_payload(text, speaker, emotion, speed):
+        payload = {}
+        if speaker.is_voice:
+            payload["voice_id"] = speaker.id
+        else:
+            payload["speaker_id"] = speaker.id
+        payload.update(
+            {
+                "emotion": emotion,
+                "name": speaker.name,
+                "text": text,
+                "speed": speed,
+            }
+        )
+        return payload
+
+    def tts(
+        self,
+        text: str,
+        speaker_name: str = None,
+        speaker_id=None,
+        emotion="Neutral",
+        speed=1.0,
+        language=None,  # pylint: disable=unused-argument
+    ) -> Tuple[np.ndarray, int]:
+        """Synthesize speech from text.
+
+        Args:
+            text (str): Text to synthesize.
+            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
+                voices (user generated speakers) with `list_voices()`.
+            speaker_id (str): Speaker ID. If None, the speaker name is used.
+            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
+            speed (float): Speed of the speech. 1.0 is normal speed.
+            language (str): Language of the text. If None, the default language of the speaker is used.
+        """
+        self._check_token()
+        if speaker_name is None and speaker_id is None:
+            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
+        if speaker_id is None:
+            speaker = self.name_to_speaker(speaker_name)
+        else:
+            speaker = self.id_to_speaker(speaker_id)
+        conn = http.client.HTTPSConnection("app.coqui.ai")
+        payload = self._create_payload(text, speaker, emotion, speed)
+        conn.request("POST", "/api/v2/samples", json.dumps(payload), self.headers)
+        res = conn.getresponse()
+        data = res.read()
+        try:
+            wav, sr = self.url_to_np(json.loads(data)["audio_url"])
+        except KeyError as e:
+            raise ValueError(f" [!] 🐸 API returned an error: {data}") from e
+        return wav, sr
+
+    def tts_to_file(
+        self,
+        text: str,
+        speaker_name: str,
+        speaker_id=None,
+        emotion="Neutral",
+        speed=1.0,
+        language=None,
+        file_path: str = None,
+    ) -> str:
+        """Synthesize speech from text and save it to a file.
+
+        Args:
+            text (str): Text to synthesize.
+            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
+                voices (user generated speakers) with `list_voices()`.
+            speaker_id (str): Speaker ID. If None, the speaker name is used.
+            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
+            speed (float): Speed of the speech. 1.0 is normal speed.
+            language (str): Language of the text. If None, the default language of the speaker is used.
+            file_path (str): Path to save the file. If None, a temporary file is created.
+        """
+        if file_path is None:
+            file_path = tempfile.mktemp(".wav")
+        wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
+        wavfile.write(file_path, sr, wav)
+        return file_path
+
+
 class TTS:
     """TODO: Add voice conversion and Capacitron support."""
@@ -54,9 +270,12 @@ class TTS:
         self.synthesizer = None
         self.voice_converter = None
+        self.csapi = None
+        self.model_name = None
 
         if model_name:
             self.load_tts_model_by_name(model_name, gpu)
+
         if model_path:
             self.load_tts_model_by_path(
                 model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
@@ -72,6 +291,10 @@
             return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
         return False
 
+    @property
+    def is_coqui_studio(self):
+        return "coqui_studio" in self.model_name
+
     @property
     def is_multi_lingual(self):
         if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
@@ -96,8 +319,14 @@
     @staticmethod
     def list_models():
+        try:
+            csapi = CS_API()
+            models = csapi.list_speakers_as_tts_models()
+        except ValueError as e:
+            print(e)
+            models = []
         manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
-        return manager.list_tts_models()
+        return manager.list_tts_models() + models
 
     def download_model_by_name(self, model_name: str):
         model_path, config_path, model_item = self.manager.download_model(model_name)
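The merged listing can be checked like this (a sketch; the `coqui_studio/...` entries only appear when a valid token is configured, otherwise `list_models()` falls back to the local catalog):

```python
from TTS.api import TTS

models = TTS.list_models()
studio = [m for m in models if m.startswith("coqui_studio/")]
print(f"{len(models)} models total, {len(studio)} from Coqui Studio")
```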
@@ -125,22 +354,28 @@
         TODO: Add tests
         """
+        self.synthesizer = None
+        self.csapi = None
+        self.model_name = model_name
 
-        model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name)
-
-        # init synthesizer
-        # None values are fetched from the model
-        self.synthesizer = Synthesizer(
-            tts_checkpoint=model_path,
-            tts_config_path=config_path,
-            tts_speakers_file=None,
-            tts_languages_file=None,
-            vocoder_checkpoint=vocoder_path,
-            vocoder_config=vocoder_config_path,
-            encoder_checkpoint=None,
-            encoder_config=None,
-            use_cuda=gpu,
-        )
+        if "coqui_studio" in model_name:
+            self.csapi = CS_API()
+        else:
+            model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name)
+
+            # init synthesizer
+            # None values are fetched from the model
+            self.synthesizer = Synthesizer(
+                tts_checkpoint=model_path,
+                tts_config_path=config_path,
+                tts_speakers_file=None,
+                tts_languages_file=None,
+                vocoder_checkpoint=vocoder_path,
+                vocoder_config=vocoder_config_path,
+                encoder_checkpoint=None,
+                encoder_config=None,
+                use_cuda=gpu,
+            )
 
     def load_tts_model_by_path(
         self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
@@ -167,17 +402,88 @@
             use_cuda=gpu,
         )
 
-    def _check_arguments(self, speaker: str = None, language: str = None, speaker_wav: str = None):
-        if self.is_multi_speaker and (speaker is None and speaker_wav is None):
-            raise ValueError("Model is multi-speaker but no speaker is provided.")
-        if self.is_multi_lingual and language is None:
-            raise ValueError("Model is multi-lingual but no language is provided.")
-        if not self.is_multi_speaker and speaker is not None:
-            raise ValueError("Model is not multi-speaker but speaker is provided.")
-        if not self.is_multi_lingual and language is not None:
-            raise ValueError("Model is not multi-lingual but language is provided.")
+    def _check_arguments(
+        self,
+        speaker: str = None,
+        language: str = None,
+        speaker_wav: str = None,
+        emotion: str = None,
+        speed: float = None,
+    ) -> None:
+        """Check if the arguments are valid for the model."""
+        if not self.is_coqui_studio:
+            # check for the coqui tts models
+            if self.is_multi_speaker and (speaker is None and speaker_wav is None):
+                raise ValueError("Model is multi-speaker but no `speaker` is provided.")
+            if self.is_multi_lingual and language is None:
+                raise ValueError("Model is multi-lingual but no `language` is provided.")
+            if not self.is_multi_speaker and speaker is not None:
+                raise ValueError("Model is not multi-speaker but `speaker` is provided.")
+            if not self.is_multi_lingual and language is not None:
+                raise ValueError("Model is not multi-lingual but `language` is provided.")
+            if emotion is not None or speed is not None:
+                raise ValueError("Emotion and speed can only be used with Coqui Studio models.")
+        else:
+            if emotion is None:
+                emotion = "Neutral"
+            if speed is None:
+                speed = 1.0
+            # check for the studio models
+            if speaker_wav is not None:
+                raise ValueError("Coqui Studio models do not support `speaker_wav` argument.")
+            if speaker is not None:
+                raise ValueError("Coqui Studio models do not support `speaker` argument.")
+            if language is not None and language != "en":
+                raise ValueError("Coqui Studio models currently support only `language=en` argument.")
+            if emotion not in ["Neutral", "Happy", "Sad", "Angry", "Dull"]:
+                raise ValueError(f"Emotion - `{emotion}` - must be one of `Neutral`, `Happy`, `Sad`, `Angry`, `Dull`.")
+
+    def tts_coqui_studio(
+        self,
+        text: str,
+        speaker_name: str = None,
+        language: str = None,
+        emotion: str = "Neutral",
+        speed: float = 1.0,
+        file_path: str = None,
+    ):
+        """Convert text to speech using Coqui Studio models. Use the `CS_API` class if you are only interested in the API.
+
+        Args:
+            text (str):
+                Input text to synthesize.
+            speaker_name (str, optional):
+                Speaker name from Coqui Studio. Defaults to None.
+            language (str, optional):
+                Language code. Coqui Studio currently supports only English. Defaults to None.
+            emotion (str, optional):
+                Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Defaults to "Neutral".
+            speed (float, optional):
+                Speed of the speech. Defaults to 1.0.
+            file_path (str, optional):
+                Path to save the output file. When None it returns the `np.ndarray` of the waveform. Defaults to None.
+        """
+        speaker_name = self.model_name.split("/")[2]
+        if file_path is not None:
+            return self.csapi.tts_to_file(
+                text=text,
+                speaker_name=speaker_name,
+                language=language,
+                speed=speed,
+                emotion=emotion,
+                file_path=file_path,
+            )
+        return self.csapi.tts(text=text, speaker_name=speaker_name, language=language, speed=speed, emotion=emotion)[0]
+
-    def tts(self, text: str, speaker: str = None, language: str = None, speaker_wav: str = None):
+    def tts(
+        self,
+        text: str,
+        speaker: str = None,
+        language: str = None,
+        speaker_wav: str = None,
+        emotion: str = None,
+        speed: float = None,
+    ):
         """Convert text to speech.
 
         Args:
@@ -192,8 +498,17 @@
             speaker_wav (str, optional):
                 Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                 Defaults to None.
+            emotion (str, optional):
+                Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
+            speed (float, optional):
+                Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. If None, Studio models use 1.0.
+                Defaults to None.
         """
-        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav)
+        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed)
+        if self.csapi is not None:
+            return self.tts_coqui_studio(
+                text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed
+            )
 
         wav = self.synthesizer.tts(
             text=text,
@@ -213,6 +528,8 @@
         speaker: str = None,
         language: str = None,
         speaker_wav: str = None,
+        emotion: str = "Neutral",
+        speed: float = 1.0,
         file_path: str = "output.wav",
     ):
         """Convert text to speech.
@@ -229,11 +546,22 @@
             speaker_wav (str, optional):
                 Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                 Defaults to None.
+            emotion (str, optional):
+                Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
+            speed (float, optional):
+                Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to 1.0.
             file_path (str, optional):
                 Output file path. Defaults to "output.wav".
         """
+        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav)
+        if self.csapi is not None:
+            return self.tts_coqui_studio(
+                text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path
+            )
+
         wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav)
         self.synthesizer.save_wav(wav=wav, path=file_path)
+        return file_path
 
     def voice_conversion(
         self,

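With these checks in place, the high-level `TTS` object routes Studio models through `tts_coqui_studio` and rejects arguments the Studio API does not accept. A sketch of both sides:

```python
from TTS.api import TTS

tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio")

# Studio models take `emotion` and `speed` directly.
tts.tts_to_file(text="This is a test.", emotion="Sad", speed=1.0, file_path="out.wav")

# Unknown emotions (and `speaker`, `speaker_wav`, or non-"en" languages) raise ValueError.
try:
    tts.tts(text="This is a test.", emotion="Confused")
except ValueError as err:
    print(err)
```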
View File (TTS/utils/synthesizer.py)

@@ -290,7 +290,14 @@ class Synthesizer(object):
             language_id = list(self.tts_model.language_manager.name_to_id.values())[0]
 
         elif language_name and isinstance(language_name, str):
-            language_id = self.tts_model.language_manager.name_to_id[language_name]
+            try:
+                language_id = self.tts_model.language_manager.name_to_id[language_name]
+            except KeyError as e:
+                raise ValueError(
+                    f" [!] Looks like you are using a multi-lingual model. "
+                    f"Language {language_name} is not in the available languages: "
+                    f"{self.tts_model.language_manager.name_to_id.keys()}."
+                ) from e
 
         elif not language_name:
             raise ValueError(
View File (inference docs)

@@ -109,7 +109,7 @@ tts-server --model_name "<type>/<language>/<dataset>/<model_name>" \
            --vocoder_name "<type>/<language>/<dataset>/<model_name>"
 ```
 
-## Python API
+## Python 🐸TTS API
 
 You can run a multi-speaker and multi-lingual model in Python as
@@ -163,4 +163,34 @@ tts.tts_with_vc_to_file(
     speaker_wav="target/speaker.wav",
     file_path="output.wav"
 )
+```
+
+Example text to speech using [🐸Coqui Studio](https://coqui.ai) models. You can use all of your available speakers in the studio.
+A [🐸Coqui Studio](https://coqui.ai) API token is required. You can get it from the [account page](https://coqui.ai/account).
+Set the `COQUI_STUDIO_TOKEN` environment variable to make the token available to the API.
+
+```python
+# If you have a valid API token set, you will see the studio speakers as separate models in the list.
+# The name format is coqui_studio/en/<studio_speaker_name>/coqui_studio
+models = TTS().list_models()
+# Init TTS with the target studio speaker
+tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False, gpu=False)
+# Run TTS
+tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
+# Run TTS with emotion and speed control
+tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
+```
+
+If you just need 🐸Coqui Studio speakers, you can use `CS_API`. It is a wrapper around the 🐸Coqui Studio API.
+
+```python
+from TTS.api import CS_API
+
+# Init 🐸Coqui Studio API
+# You can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument.
+api = CS_API(api_token=<token>)
+api.speakers
+api.emotions
+api.list_speakers()
+api.list_voices()
+wav, sample_rate = api.tts(text="This is a test.", speaker_name=api.speakers[0].name, emotion="Happy", speed=1.5)
 ```
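Conversely, if no token is configured at all, `CS_API` fails fast at construction time with the guidance from `_check_token`; a sketch:

```python
import os

from TTS.api import CS_API

os.environ.pop("COQUI_STUDIO_TOKEN", None)  # simulate a missing token
try:
    CS_API()
except ValueError as err:
    print(err)  # "No API token found for 🐸Coqui Studio voices ..."
```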

View File (Python API tests)

@@ -2,12 +2,41 @@ import os
 import unittest
 
 from tests import get_tests_data_path, get_tests_output_path
-from TTS.api import TTS
+from TTS.api import CS_API, TTS
 
 OUTPUT_PATH = os.path.join(get_tests_output_path(), "test_python_api.wav")
 cloning_test_wav_path = os.path.join(get_tests_data_path(), "ljspeech/wavs/LJ001-0028.wav")
 
+
+class CS_APITest(unittest.TestCase):
+    def test_speakers(self):
+        tts = CS_API()
+        self.assertGreater(len(tts.speakers), 1)
+
+    def test_emotions(self):
+        tts = CS_API()
+        self.assertGreater(len(tts.emotions), 1)
+
+    def test_list_calls(self):
+        tts = CS_API()
+        self.assertGreater(len(tts.list_voices()), 1)
+        self.assertGreater(len(tts.list_speakers()), 1)
+        self.assertGreater(len(tts.list_all_speakers()), 1)
+        self.assertGreater(len(tts.list_speakers_as_tts_models()), 1)
+
+    def test_name_to_speaker(self):
+        tts = CS_API()
+        speaker_name = tts.list_speakers_as_tts_models()[0].split("/")[2]
+        speaker = tts.name_to_speaker(speaker_name)
+        self.assertEqual(speaker.name, speaker_name)
+
+    def test_tts(self):
+        tts = CS_API()
+        wav, sr = tts.tts(text="This is a test.", speaker_name=tts.list_speakers()[0].name)
+        self.assertEqual(sr, 44100)
+        self.assertGreater(len(wav), 1)
+
+
 class TTSTest(unittest.TestCase):
     def test_single_speaker_model(self):
         tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False)
@@ -26,6 +55,30 @@ class TTSTest(unittest.TestCase):
         self.assertIsNone(tts.speakers)
         self.assertIsNone(tts.languages)
 
+    def test_studio_model(self):
+        tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio")
+        tts.tts_to_file(text="This is a test.")
+
+        # check that speed > 2.0 raises an error
+        raised_error = False
+        try:
+            _ = tts.tts(text="This is a test.", speed=4.0, emotion="Sad")  # should raise an error with speed > 2.0
+        except ValueError:
+            raised_error = True
+        self.assertTrue(raised_error)
+
+        # check that an invalid emotion raises an error
+        raised_error = False
+        try:
+            _ = tts.tts(text="This is a test.", speed=2.0, emotion="No Emo")  # should raise an error with an invalid emotion
+        except ValueError:
+            raised_error = True
+        self.assertTrue(raised_error)
+
+        # check a valid call
+        wav = tts.tts(text="This is a test.", speed=2.0, emotion="Sad")
+        self.assertGreater(len(wav), 0)
+
     def test_multi_speaker_multi_lingual_model(self):
         tts = TTS()
         tts.load_tts_model_by_name(tts.models[0])  # YourTTS
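These tests hit the live Studio API, so they need a real token, as the CI step above provides. A hypothetical local run (the module path is assumed from the repository layout; adjust it if the tests live elsewhere):

```python
import os
import unittest

os.environ.setdefault("COQUI_STUDIO_TOKEN", "<your-token>")  # placeholder token
unittest.main(module="tests.inference_tests.test_python_api", exit=False, argv=["python"])
```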