Merge pull request #3173 from coqui-ai/dev

v0.20.2
Eren Gölge 2023-11-08 16:08:22 +01:00 committed by GitHub
commit ab57c36c2b
15 changed files with 383 additions and 437 deletions


@ -2,7 +2,7 @@
## 🐸Coqui.ai News
- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
- 📣 ⓍTTS can now stream with <200ms latency.
- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
@ -205,7 +205,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
print(TTS().list_models())
# Init TTS
-tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device)
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
# Run TTS
# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language
@ -267,19 +267,13 @@ models = TTS(cs_api_model="XTTS").list_models()
# Init TTS with the target studio speaker
tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
# Run TTS
-tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
+tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH)
# V1 model
models = TTS(cs_api_model="V1").list_models()
# Run TTS with emotion and speed control
# Emotion control only works with V1 model
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
-# XTTS-multilingual
-models = TTS(cs_api_model="XTTS-multilingual").list_models()
-# Run TTS with emotion and speed control
-# Emotion control only works with V1 model
-tts.tts_to_file(text="Das ist ein Test.", file_path=OUTPUT_PATH, language="de", speed=1.0)
```
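For orientation, here is a minimal end-to-end sketch of the updated snippet; the reference clip path is hypothetical and the XTTS v2 weights are downloaded on first use:

```python
import torch
from TTS.api import TTS

device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# XTTS is a voice-cloning model, so both a reference clip and a language are required.
tts.tts_to_file(
    text="Hello, this is a cloned voice speaking.",
    speaker_wav="path/to/reference.wav",  # hypothetical local recording of the target speaker
    language="en",
    file_path="output.wav",
)
```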
#### Example text to speech using **Fairseq models in ~1100 languages** 🤯.


@ -1 +1 @@
-0.20.1
+0.20.2


@ -60,7 +60,7 @@ class TTS(nn.Module):
vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
-    "XTTS", "XTTS-multilingual", "V1". You can also use `TTS.cs_api.CS_API` for more control.
+    "XTTS", "V1". You can also use `TTS.cs_api.CS_API` for more control.
    Defaults to "XTTS".
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
@ -275,7 +275,7 @@ class TTS(nn.Module):
speaker_name (str, optional):
    Speaker name from Coqui Studio. Defaults to None.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-    supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+    supported by `XTTS` model.
emotion (str, optional):
    Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
    with "V1" model. Defaults to None.
@ -321,7 +321,7 @@ class TTS(nn.Module):
Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-    supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+    supported by `XTTS` model.
speaker_wav (str, optional):
    Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
    Defaults to None.


@ -227,7 +227,7 @@ def main():
parser.add_argument(
    "--cs_model",
    type=str,
-    help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `XTTS-multilingual`, `V1`.",
+    help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.",
)
parser.add_argument( parser.add_argument(
"--emotion", "--emotion",
@ -238,7 +238,7 @@ def main():
parser.add_argument(
    "--language",
    type=str,
-    help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.",
+    help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.",
    default=None,
)
parser.add_argument(


@ -43,7 +43,7 @@ class CS_API:
Args:
    api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
        `COQUI_STUDIO_TOKEN`.
-    model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`, or `XTTS-multilang`. Default is `XTTS`.
+    model (str): 🐸Coqui Studio model. It can be either `V1` or `XTTS`. Default is `XTTS`.
Example listing all available speakers:
@ -65,7 +65,7 @@ class CS_API:
Example with multi-language model:
    >>> from TTS.api import CS_API
-    >>> tts = CS_API(model="XTTS-multilang")
+    >>> tts = CS_API(model="XTTS")
    >>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
"""
@ -78,16 +78,12 @@ class CS_API:
"XTTS": { "XTTS": {
"list_speakers": "https://app.coqui.ai/api/v2/speakers", "list_speakers": "https://app.coqui.ai/api/v2/speakers",
"synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/", "synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
"list_voices": "https://app.coqui.ai/api/v2/voices/xtts/", "list_voices": "https://app.coqui.ai/api/v2/voices/xtts",
},
"XTTS-multilang": {
"list_speakers": "https://app.coqui.ai/api/v2/speakers",
"synthesize": "https://app.coqui.ai/api/v2/samples/multilingual/render/",
"list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
}, },
} }
SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl"]
SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]
def __init__(self, api_token=None, model="XTTS"):
    self.api_token = api_token
@ -139,7 +135,7 @@ class CS_API:
self._check_token()
conn = http.client.HTTPSConnection("app.coqui.ai")
url = self.MODEL_ENDPOINTS[self.model]["list_speakers"]
-conn.request("GET", f"{url}?per_page=100", headers=self.headers)
+conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
res = conn.getresponse()
data = res.read()
return [Speaker(s) for s in json.loads(data)["result"]]
@ -148,7 +144,7 @@ class CS_API:
"""List custom voices created by the user.""" """List custom voices created by the user."""
conn = http.client.HTTPSConnection("app.coqui.ai") conn = http.client.HTTPSConnection("app.coqui.ai")
url = self.MODEL_ENDPOINTS[self.model]["list_voices"] url = self.MODEL_ENDPOINTS[self.model]["list_voices"]
conn.request("GET", f"{url}", headers=self.headers) conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
res = conn.getresponse() res = conn.getresponse()
data = res.read() data = res.read()
return [Speaker(s, True) for s in json.loads(data)["result"]] return [Speaker(s, True) for s in json.loads(data)["result"]]
@ -197,14 +193,6 @@ class CS_API:
    }
)
elif model == "XTTS":
-    payload.update(
-        {
-            "name": speaker.name,
-            "text": text,
-            "speed": speed,
-        }
-    )
-elif model == "XTTS-multilang":
    payload.update(
        {
            "name": speaker.name,
@ -226,13 +214,10 @@ class CS_API:
assert language is None, "❗ language is not supported for V1 model."
elif self.model == "XTTS":
    assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model."
-    assert language is None, "❗ Language is not supported for XTTS model. Use XTTS-multilang model."
+    assert language is not None, "❗ Language is required for XTTS model."
-elif self.model == "XTTS-multilang":
-    assert emotion is None, f"❗ Emotions are not supported for XTTS-multilang model. Use V1 model."
-    assert language is not None, "❗ Language is required for XTTS-multilang model."
    assert (
        language in self.SUPPORTED_LANGUAGES
-    ), f"❗ Language {language} is not yet supported. Use one of: en, es, de, fr, it, pt, pl"
+    ), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create."
return text, speaker_name, speaker_id, emotion, speed, language
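For context, a quick sketch of how a caller hits the checks above now that the multilingual variant is folded into `XTTS`; the token is a placeholder:

```python
from TTS.cs_api import CS_API

# Placeholder token; a real Coqui Studio API token is required.
api = CS_API(api_token="YOUR_TOKEN", model="XTTS")

# `language` is now mandatory for XTTS and must be in CS_API.SUPPORTED_LANGUAGES,
# otherwise the assertions above fail before any request is sent.
wav, sample_rate = api.tts(
    text="Hello world",
    speaker_name=api.speakers[0].name,
    language="en",
)
```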
def tts(
@ -255,7 +240,7 @@ class CS_API:
    supported by `V1` model. Defaults to None.
speed (float): Speed of the speech. 1.0 is normal speed.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-    supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+    supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
"""
self._check_token()
self.ping_api()
@ -305,7 +290,7 @@ class CS_API:
speed (float): Speed of the speech. 1.0 is normal speed.
pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
-    supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
+    supported by `XTTS` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
file_path (str): Path to save the file. If None, a temporary file is created.
"""
if file_path is None:
@ -323,20 +308,7 @@ if __name__ == "__main__":
print(api.list_speakers_as_tts_models())
ts = time.time()
-wav, sr = api.tts("It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name)
+wav, sr = api.tts("It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name)
print(f" [i] XTTS took {time.time() - ts:.2f}s")
-filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav")
+filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav")
-api = CS_API(model="XTTS-multilang")
-print(api.speakers)
-ts = time.time()
-wav, sr = api.tts(
-    "It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name, language="en"
-)
-print(f" [i] XTTS took {time.time() - ts:.2f}s")
-filepath = api.tts_to_file(
-    text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav", language="en"
-)


@ -37,29 +37,11 @@ class XttsConfig(BaseTTSConfig):
    If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
    Defaults to `0.8`.
-cond_free_k (float):
-    Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
-    As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
-    Formula is: output=cond_present_output*(cond_free_k+1)-cond_absenct_output*cond_free_k. Defaults to `2.0`.
-diffusion_temperature (float):
-    Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
-    are the "mean" prediction of the diffusion network and will sound bland and smeared.
-    Defaults to `1.0`.
num_gpt_outputs (int):
    Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
    As XTTS is a probabilistic model, more samples means a higher probability of creating something "great".
    Defaults to `16`.
-decoder_iterations (int):
-    Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
-    the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
-    however. Defaults to `30`.
-decoder_sampler (str):
-    Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.
gpt_cond_len (int):
    Secs audio to be used as conditioning for the autoregressive model. Defaults to `3`.
@ -110,11 +92,7 @@ class XttsConfig(BaseTTSConfig):
repetition_penalty: float = 2.0
top_k: int = 50
top_p: float = 0.85
-cond_free_k: float = 2.0
-diffusion_temperature: float = 1.0
num_gpt_outputs: int = 1
-decoder_iterations: int = 30
-decoder_sampler: str = "ddim"
# cloning
gpt_cond_len: int = 3
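As a quick sanity check of what is still configurable, a hedged sketch that only uses fields visible in this diff; removed fields such as `decoder_iterations` or `cond_free_k` are no longer defined on the dataclass:

```python
from TTS.tts.configs.xtts_config import XttsConfig

config = XttsConfig(
    repetition_penalty=2.0,
    top_k=50,
    top_p=0.85,
    num_gpt_outputs=1,
    gpt_cond_len=3,
)
print(config.top_p)  # 0.85
```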


@ -8,6 +8,7 @@ from hangul_romanize import Transliter
from hangul_romanize.rule import academic
from num2words import num2words
from tokenizers import Tokenizer
+from functools import cached_property
from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
@ -535,11 +536,50 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "
class VoiceBpeTokenizer:
    def __init__(self, vocab_file=None):
        self.tokenizer = None
-       self.katsu = None
        if vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)
        self.char_limits = {
            "en": 250,
            "de": 253,
            "fr": 273,
            "es": 239,
            "it": 213,
            "pt": 203,
            "pl": 224,
            "zh-cn": 82,
            "ar": 166,
            "cs": 186,
            "ru": 182,
            "nl": 251,
            "tr": 226,
            "ja": 71,
            "hu": 224,
            "ko": 95,
        }

    @cached_property
    def katsu(self):
        import cutlet
        return cutlet.Cutlet()

    def check_input_length(self, txt, lang):
        limit = self.char_limits.get(lang, 250)
        if len(txt) > limit:
            print(f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio.")

    def preprocess_text(self, txt, lang):
        if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "ar", "cs", "ru", "nl", "tr", "zh-cn"]:
            txt = multilingual_cleaners(txt, lang)
            if lang == "zh-cn":
                txt = chinese_transliterate(txt)
        elif lang == "ja":
            txt = japanese_cleaners(txt, self.katsu)
        else:
            raise NotImplementedError()
        return txt

    def encode(self, txt, lang):
+       self.check_input_length(txt, lang)
        txt = self.preprocess_text(txt, lang)
        txt = f"[{lang}]{txt}"
        txt = txt.replace(" ", "[SPACE]")
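The effect of the new per-language limit is easiest to see in isolation; a small sketch assuming a vocab file from an XTTS checkpoint is available locally:

```python
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer

# Hypothetical path; in practice this is the vocab.json shipped with an XTTS checkpoint.
tok = VoiceBpeTokenizer(vocab_file="/path/to/xtts/vocab.json")

short = "Hello there."
long_text = "word " * 100  # ~500 characters, well past the 250-character English limit

tok.check_input_length(short, "en")      # stays silent
tok.check_input_length(long_text, "en")  # prints the truncation warning

ids = tok.encode(short, "en")  # encode() now runs the same check before tokenizing
```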


@ -152,19 +152,6 @@ class XttsArgs(Coqpit):
gpt_code_stride_len (int, optional): The hop_size of dvae and consequently of the gpt output. Defaults to 1024.
gpt_use_masking_gt_prompt_approach (bool, optional): If True, it will use ground truth as prompt and it will mask the loss to avoid repetition. Defaults to True.
gpt_use_perceiver_resampler (bool, optional): If True, it will use perceiver resampler from flamingo paper - https://arxiv.org/abs/2204.14198. Defaults to False.
-For DiffTTS model:
-    diff_model_channels (int, optional): The number of channels for the DiffTTS model. Defaults to 1024.
-    diff_num_layers (int, optional): The number of layers for the DiffTTS model. Defaults to 10.
-    diff_in_channels (int, optional): The input channels for the DiffTTS model. Defaults to 100.
-    diff_out_channels (int, optional): The output channels for the DiffTTS model. Defaults to 200.
-    diff_in_latent_channels (int, optional): The input latent channels for the DiffTTS model. Defaults to 1024.
-    diff_in_tokens (int, optional): The input tokens for the DiffTTS model. Defaults to 8193.
-    diff_dropout (int, optional): The dropout percentage for the DiffTTS model. Defaults to 0.
-    diff_use_fp16 (bool, optional): Whether to use fp16 for the DiffTTS model. Defaults to False.
-    diff_num_heads (int, optional): The number of heads for the DiffTTS model. Defaults to 16.
-    diff_layer_drop (int, optional): The layer dropout percentage for the DiffTTS model. Defaults to 0.
-    diff_unconditioned_percentage (int, optional): The percentage of unconditioned inputs for the DiffTTS model. Defaults to 0.
"""
gpt_batch_size: int = 1
@ -193,19 +180,6 @@ class XttsArgs(Coqpit):
gpt_use_masking_gt_prompt_approach: bool = True
gpt_use_perceiver_resampler: bool = False
-# Diffusion Decoder params
-diff_model_channels: int = 1024
-diff_num_layers: int = 10
-diff_in_channels: int = 100
-diff_out_channels: int = 200
-diff_in_latent_channels: int = 1024
-diff_in_tokens: int = 8193
-diff_dropout: int = 0
-diff_use_fp16: bool = False
-diff_num_heads: int = 16
-diff_layer_drop: int = 0
-diff_unconditioned_percentage: int = 0
# HifiGAN Decoder params
input_sample_rate: int = 22050
output_sample_rate: int = 24000
@ -426,10 +400,6 @@ class Xtts(BaseTTS):
"repetition_penalty": config.repetition_penalty, "repetition_penalty": config.repetition_penalty,
"top_k": config.top_k, "top_k": config.top_k,
"top_p": config.top_p, "top_p": config.top_p,
"cond_free_k": config.cond_free_k,
"diffusion_temperature": config.diffusion_temperature,
"decoder_iterations": config.decoder_iterations,
"decoder_sampler": config.decoder_sampler,
"gpt_cond_len": config.gpt_cond_len, "gpt_cond_len": config.gpt_cond_len,
"max_ref_len": config.max_ref_len, "max_ref_len": config.max_ref_len,
"sound_norm_refs": config.sound_norm_refs, "sound_norm_refs": config.sound_norm_refs,
@ -454,13 +424,6 @@ class Xtts(BaseTTS):
gpt_cond_len=6,
max_ref_len=10,
sound_norm_refs=False,
-# Decoder inference
-decoder_iterations=100,
-cond_free=True,
-cond_free_k=2,
-diffusion_temperature=1.0,
-decoder_sampler="ddim",
-decoder="hifigan",
**hf_generate_kwargs,
):
"""
@ -603,10 +566,21 @@ class Xtts(BaseTTS):
if wav_gen_prev is not None:
    wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len) : -overlap_len]
if wav_overlap is not None:
-    crossfade_wav = wav_chunk[:overlap_len]
-    crossfade_wav = crossfade_wav * torch.linspace(0.0, 1.0, overlap_len).to(crossfade_wav.device)
-    wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len).to(wav_overlap.device)
-    wav_chunk[:overlap_len] += crossfade_wav
+    # cross fade the overlap section
+    if overlap_len > len(wav_chunk):
+        # wav_chunk is smaller than overlap_len, pass on last wav_gen
+        if wav_gen_prev is not None:
+            wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len):]
+        else:
+            # not expecting will hit here as problem happens on last chunk
+            wav_chunk = wav_gen[-overlap_len:]
+        return wav_chunk, wav_gen, None
+    else:
+        crossfade_wav = wav_chunk[:overlap_len]
+        crossfade_wav = crossfade_wav * torch.linspace(0.0, 1.0, overlap_len).to(crossfade_wav.device)
+        wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len).to(wav_overlap.device)
+        wav_chunk[:overlap_len] += crossfade_wav
wav_overlap = wav_gen[-overlap_len:]
wav_gen_prev = wav_gen
return wav_chunk, wav_gen_prev, wav_overlap
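The extra branch exists because the final chunk of a stream can be shorter than the configured overlap, in which case the old in-place crossfade could fail with a shape mismatch. A self-contained sketch of the same linear crossfade idea, with illustrative tensors rather than the model's buffers:

```python
import torch

def linear_crossfade(prev_tail: torch.Tensor, new_head: torch.Tensor) -> torch.Tensor:
    """Fade out the previous overlap while fading in the new one."""
    n = prev_tail.shape[0]
    fade_in = torch.linspace(0.0, 1.0, n)
    fade_out = torch.linspace(1.0, 0.0, n)
    return prev_tail * fade_out + new_head * fade_in

overlap_len = 1024
prev_chunk = torch.randn(4096)   # tail of previously generated audio
new_chunk = torch.randn(2048)    # freshly generated audio

if overlap_len > len(new_chunk):
    # mirrors the new guard: a too-short chunk is passed through instead of crossfaded
    out = new_chunk
else:
    new_chunk[:overlap_len] = linear_crossfade(prev_chunk[-overlap_len:], new_chunk[:overlap_len])
    out = new_chunk
print(out.shape)
```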


@ -109,7 +109,6 @@ class ModelManager(object):
def _list_for_model_type(self, model_type):
    models_name_list = []
    model_count = 1
-    model_type = "tts_models"
    models_name_list.extend(self._list_models(model_type, model_count))
    return models_name_list
@ -298,22 +297,22 @@ class ModelManager(object):
model_item = self.set_model_url(model_item)
return model_item, model_full_name, model, md5hash

-    def ask_tos(self, model_full_path):
+    @staticmethod
+    def ask_tos(model_full_path):
        """Ask the user to agree to the terms of service"""
        tos_path = os.path.join(model_full_path, "tos_agreed.txt")
-        if not os.path.exists(tos_path):
-            print(" > You must agree to the terms of service to use this model.")
-            print(" | > Please see the terms of service at https://coqui.ai/cpml.txt")
-            print(' | > "I have read, understood and agreed the Terms and Conditions." - [y/n]')
-            answer = input(" | | > ")
-            if answer.lower() == "y":
-                with open(tos_path, "w") as f:
-                    f.write("I have read, understood ad agree the Terms and Conditions.")
-                return True
-            else:
-                return False
+        print(" > You must agree to the terms of service to use this model.")
+        print(" | > Please see the terms of service at https://coqui.ai/cpml.txt")
+        print(' | > "I have read, understood and agreed to the Terms and Conditions." - [y/n]')
+        answer = input(" | | > ")
+        if answer.lower() == "y":
+            with open(tos_path, "w", encoding="utf-8") as f:
+                f.write("I have read, understood and agreed to the Terms and Conditions.")
+            return True
+        return False

-    def tos_agreed(self, model_item, model_full_path):
+    @staticmethod
+    def tos_agreed(model_item, model_full_path):
        """Check if the user has agreed to the terms of service"""
        if "tos_required" in model_item and model_item["tos_required"]:
            tos_path = os.path.join(model_full_path, "tos_agreed.txt")
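Since both helpers are now static, they can be called without constructing a manager; a hedged sketch of the expected flow (the model path is hypothetical):

```python
from TTS.utils.manage import ModelManager

# Hypothetical download location of a CPML-licensed model.
model_path = "/home/user/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2"
model_item = {"tos_required": True}

if not ModelManager.tos_agreed(model_item, model_path):
    # Prompts on stdin and, on "y", writes tos_agreed.txt into the model directory.
    accepted = ModelManager.ask_tos(model_path)
```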


@ -1,5 +1,278 @@
from dataclasses import dataclass, field
-from typing import List
+from typing import List, Optional
+from coqpit import Coqpit
from TTS.vc.configs.shared_configs import BaseVCConfig
-from TTS.vc.models.freevc import FreeVCArgs, FreeVCAudioConfig, FreeVCConfig
@dataclass
class FreeVCAudioConfig(Coqpit):
"""Audio configuration
Args:
max_wav_value (float):
The maximum value of the waveform.
input_sample_rate (int):
The sampling rate of the input waveform.
output_sample_rate (int):
The sampling rate of the output waveform.
filter_length (int):
The length of the filter.
hop_length (int):
The hop length.
win_length (int):
The window length.
n_mel_channels (int):
The number of mel channels.
mel_fmin (float):
The minimum frequency of the mel filterbank.
mel_fmax (Optional[float]):
The maximum frequency of the mel filterbank.
"""
max_wav_value: float = field(default=32768.0)
input_sample_rate: int = field(default=16000)
output_sample_rate: int = field(default=24000)
filter_length: int = field(default=1280)
hop_length: int = field(default=320)
win_length: int = field(default=1280)
n_mel_channels: int = field(default=80)
mel_fmin: float = field(default=0.0)
mel_fmax: Optional[float] = field(default=None)
@dataclass
class FreeVCArgs(Coqpit):
"""FreeVC model arguments
Args:
spec_channels (int):
The number of channels in the spectrogram.
inter_channels (int):
The number of channels in the intermediate layers.
hidden_channels (int):
The number of channels in the hidden layers.
filter_channels (int):
The number of channels in the filter layers.
n_heads (int):
The number of attention heads.
n_layers (int):
The number of layers.
kernel_size (int):
The size of the kernel.
p_dropout (float):
The dropout probability.
resblock (str):
The type of residual block.
resblock_kernel_sizes (List[int]):
The kernel sizes for the residual blocks.
resblock_dilation_sizes (List[List[int]]):
The dilation sizes for the residual blocks.
upsample_rates (List[int]):
The upsample rates.
upsample_initial_channel (int):
The number of channels in the initial upsample layer.
upsample_kernel_sizes (List[int]):
The kernel sizes for the upsample layers.
n_layers_q (int):
The number of layers in the quantization network.
use_spectral_norm (bool):
Whether to use spectral normalization.
gin_channels (int):
The number of channels in the global conditioning vector.
ssl_dim (int):
The dimension of the self-supervised learning embedding.
use_spk (bool):
Whether to use external speaker encoder.
"""
spec_channels: int = field(default=641)
inter_channels: int = field(default=192)
hidden_channels: int = field(default=192)
filter_channels: int = field(default=768)
n_heads: int = field(default=2)
n_layers: int = field(default=6)
kernel_size: int = field(default=3)
p_dropout: float = field(default=0.1)
resblock: str = field(default="1")
resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11])
resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2])
upsample_initial_channel: int = field(default=512)
upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
n_layers_q: int = field(default=3)
use_spectral_norm: bool = field(default=False)
gin_channels: int = field(default=256)
ssl_dim: int = field(default=1024)
use_spk: bool = field(default=False)
num_spks: int = field(default=0)
segment_size: int = field(default=8960)
@dataclass
class FreeVCConfig(BaseVCConfig):
"""Defines parameters for FreeVC End2End TTS model.
Args:
model (str):
Model name. Do not change unless you know what you are doing.
model_args (FreeVCArgs):
Model architecture arguments. Defaults to `FreeVCArgs()`.
audio (FreeVCAudioConfig):
Audio processing configuration. Defaults to `FreeVCAudioConfig()`.
grad_clip (List):
Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`.
lr_gen (float):
Initial learning rate for the generator. Defaults to 0.0002.
lr_disc (float):
Initial learning rate for the discriminator. Defaults to 0.0002.
lr_scheduler_gen (str):
Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_gen_params (dict):
Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
lr_scheduler_disc (str):
Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_disc_params (dict):
Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
scheduler_after_epoch (bool):
If true, step the schedulers after each epoch else after each step. Defaults to `False`.
optimizer (str):
Name of the optimizer to use with both the generator and the discriminator networks. One of the
`torch.optim.*`. Defaults to `AdamW`.
kl_loss_alpha (float):
Loss weight for KL loss. Defaults to 1.0.
disc_loss_alpha (float):
Loss weight for the discriminator loss. Defaults to 1.0.
gen_loss_alpha (float):
Loss weight for the generator loss. Defaults to 1.0.
feat_loss_alpha (float):
Loss weight for the feature matching loss. Defaults to 1.0.
mel_loss_alpha (float):
Loss weight for the mel loss. Defaults to 45.0.
return_wav (bool):
If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`.
compute_linear_spec (bool):
If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.
use_weighted_sampler (bool):
If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`.
weighted_sampler_attrs (dict):
Key returned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities
by overweighting `root_path` by 2.0. Defaults to `{}`.
weighted_sampler_multipliers (dict):
Weight each unique value of a key returned by the formatter for weighted sampling.
For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`.
It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`.
r (int):
Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.
add_blank (bool):
If true, a blank token is added in between every character. Defaults to `True`.
test_sentences (List[List]):
List of sentences with speaker and language information to be used for testing.
language_ids_file (str):
Path to the language ids file.
use_language_embedding (bool):
If true, language embedding is used. Defaults to `False`.
Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
Example:
>>> from TTS.vc.configs.freevc_config import FreeVCConfig
>>> config = FreeVCConfig()
"""
model: str = "freevc"
# model specific params
model_args: FreeVCArgs = field(default_factory=FreeVCArgs)
audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig)
# optimizer
# TODO with training support
# loss params
# TODO with training support
# data loader params
return_wav: bool = True
compute_linear_spec: bool = True
# sampler params
use_weighted_sampler: bool = False # TODO: move it to the base config
weighted_sampler_attrs: dict = field(default_factory=lambda: {})
weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
# overrides
r: int = 1 # DO NOT CHANGE
add_blank: bool = True
# multi-speaker settings
# use speaker embedding layer
num_speakers: int = 0
speakers_file: str = None
speaker_embedding_channels: int = 256
# use d-vectors
use_d_vector_file: bool = False
d_vector_file: List[str] = None
d_vector_dim: int = None
def __post_init__(self):
for key, val in self.model_args.items():
if hasattr(self, key):
self[key] = val
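With the config classes moved into their own module, they are imported from `TTS.vc.configs.freevc_config` rather than `TTS.vc.models.freevc`; a minimal construction sketch using the defaults defined above:

```python
from TTS.vc.configs.freevc_config import FreeVCArgs, FreeVCAudioConfig, FreeVCConfig

config = FreeVCConfig(
    audio=FreeVCAudioConfig(input_sample_rate=16000, output_sample_rate=24000),
    model_args=FreeVCArgs(ssl_dim=1024, use_spk=False),
)
print(config.model)  # "freevc"
```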


@ -1,4 +1,3 @@
-from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple, Union
import librosa
@ -13,8 +12,8 @@ from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
import TTS.vc.modules.freevc.commons as commons
import TTS.vc.modules.freevc.modules as modules
from TTS.tts.utils.speakers import SpeakerManager
-from TTS.utils.io import load_fsspec, save_checkpoint
+from TTS.utils.io import load_fsspec
-from TTS.vc.configs.shared_configs import BaseVCConfig
+from TTS.vc.configs.freevc_config import FreeVCConfig
from TTS.vc.models.base_vc import BaseVC
from TTS.vc.modules.freevc.commons import get_padding, init_weights
from TTS.vc.modules.freevc.mel_processing import mel_spectrogram_torch
@ -294,136 +293,6 @@ class SpeakerEncoder(torch.nn.Module):
return embed
@dataclass
class FreeVCAudioConfig(Coqpit):
"""Audio configuration
Args:
max_wav_value (float):
The maximum value of the waveform.
input_sample_rate (int):
The sampling rate of the input waveform.
output_sample_rate (int):
The sampling rate of the output waveform.
filter_length (int):
The length of the filter.
hop_length (int):
The hop length.
win_length (int):
The window length.
n_mel_channels (int):
The number of mel channels.
mel_fmin (float):
The minimum frequency of the mel filterbank.
mel_fmax (Optional[float]):
The maximum frequency of the mel filterbank.
"""
max_wav_value: float = field(default=32768.0)
input_sample_rate: int = field(default=16000)
output_sample_rate: int = field(default=24000)
filter_length: int = field(default=1280)
hop_length: int = field(default=320)
win_length: int = field(default=1280)
n_mel_channels: int = field(default=80)
mel_fmin: float = field(default=0.0)
mel_fmax: Optional[float] = field(default=None)
@dataclass
class FreeVCArgs(Coqpit):
"""FreeVC model arguments
Args:
spec_channels (int):
The number of channels in the spectrogram.
inter_channels (int):
The number of channels in the intermediate layers.
hidden_channels (int):
The number of channels in the hidden layers.
filter_channels (int):
The number of channels in the filter layers.
n_heads (int):
The number of attention heads.
n_layers (int):
The number of layers.
kernel_size (int):
The size of the kernel.
p_dropout (float):
The dropout probability.
resblock (str):
The type of residual block.
resblock_kernel_sizes (List[int]):
The kernel sizes for the residual blocks.
resblock_dilation_sizes (List[List[int]]):
The dilation sizes for the residual blocks.
upsample_rates (List[int]):
The upsample rates.
upsample_initial_channel (int):
The number of channels in the initial upsample layer.
upsample_kernel_sizes (List[int]):
The kernel sizes for the upsample layers.
n_layers_q (int):
The number of layers in the quantization network.
use_spectral_norm (bool):
Whether to use spectral normalization.
gin_channels (int):
The number of channels in the global conditioning vector.
ssl_dim (int):
The dimension of the self-supervised learning embedding.
use_spk (bool):
Whether to use external speaker encoder.
"""
spec_channels: int = field(default=641)
inter_channels: int = field(default=192)
hidden_channels: int = field(default=192)
filter_channels: int = field(default=768)
n_heads: int = field(default=2)
n_layers: int = field(default=6)
kernel_size: int = field(default=3)
p_dropout: float = field(default=0.1)
resblock: str = field(default="1")
resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11])
resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2])
upsample_initial_channel: int = field(default=512)
upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
n_layers_q: int = field(default=3)
use_spectral_norm: bool = field(default=False)
gin_channels: int = field(default=256)
ssl_dim: int = field(default=1024)
use_spk: bool = field(default=False)
num_spks: int = field(default=0)
segment_size: int = field(default=8960)
class FreeVC(BaseVC):
    """
@ -677,7 +546,7 @@ class FreeVC(BaseVC):
...

@staticmethod
-def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None, verbose=True):
+def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None, verbose=True):
    model = FreeVC(config)
    return model
@ -689,145 +558,3 @@ class FreeVC(BaseVC):
def train_step():
    ...
@dataclass
class FreeVCConfig(BaseVCConfig):
"""Defines parameters for FreeVC End2End TTS model.
Args:
model (str):
Model name. Do not change unless you know what you are doing.
model_args (FreeVCArgs):
Model architecture arguments. Defaults to `FreeVCArgs()`.
audio (FreeVCAudioConfig):
Audio processing configuration. Defaults to `FreeVCAudioConfig()`.
grad_clip (List):
Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`.
lr_gen (float):
Initial learning rate for the generator. Defaults to 0.0002.
lr_disc (float):
Initial learning rate for the discriminator. Defaults to 0.0002.
lr_scheduler_gen (str):
Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_gen_params (dict):
Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
lr_scheduler_disc (str):
Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_disc_params (dict):
Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
scheduler_after_epoch (bool):
If true, step the schedulers after each epoch else after each step. Defaults to `False`.
optimizer (str):
Name of the optimizer to use with both the generator and the discriminator networks. One of the
`torch.optim.*`. Defaults to `AdamW`.
kl_loss_alpha (float):
Loss weight for KL loss. Defaults to 1.0.
disc_loss_alpha (float):
Loss weight for the discriminator loss. Defaults to 1.0.
gen_loss_alpha (float):
Loss weight for the generator loss. Defaults to 1.0.
feat_loss_alpha (float):
Loss weight for the feature matching loss. Defaults to 1.0.
mel_loss_alpha (float):
Loss weight for the mel loss. Defaults to 45.0.
return_wav (bool):
If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`.
compute_linear_spec (bool):
If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.
use_weighted_sampler (bool):
If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`.
weighted_sampler_attrs (dict):
Key retuned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities
by overweighting `root_path` by 2.0. Defaults to `{}`.
weighted_sampler_multipliers (dict):
Weight each unique value of a key returned by the formatter for weighted sampling.
For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`.
It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`.
r (int):
Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.
add_blank (bool):
If true, a blank token is added in between every character. Defaults to `True`.
test_sentences (List[List]):
List of sentences with speaker and language information to be used for testing.
language_ids_file (str):
Path to the language ids file.
use_language_embedding (bool):
If true, language embedding is used. Defaults to `False`.
Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
Example:
>>> from TTS.tts.configs.freevc_config import FreeVCConfig
>>> config = FreeVCConfig()
"""
model: str = "freevc"
# model specific params
model_args: FreeVCArgs = field(default_factory=FreeVCArgs)
audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig)
# optimizer
# TODO with training support
# loss params
# TODO with training support
# data loader params
return_wav: bool = True
compute_linear_spec: bool = True
# sampler params
use_weighted_sampler: bool = False # TODO: move it to the base config
weighted_sampler_attrs: dict = field(default_factory=lambda: {})
weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
# overrides
r: int = 1 # DO NOT CHANGE
add_blank: bool = True
# multi-speaker settings
# use speaker embedding layer
num_speakers: int = 0
speakers_file: str = None
speaker_embedding_channels: int = 256
# use d-vectors
use_d_vector_file: bool = False
d_vector_file: List[str] = None
d_vector_dim: int = None
def __post_init__(self):
for key, val in self.model_args.items():
if hasattr(self, key):
self[key] = val


@ -195,10 +195,10 @@ def _apply_D_loss(scores_fake, scores_real, loss_func):
if isinstance(scores_fake, list):
    # multi-scale loss
    for score_fake, score_real in zip(scores_fake, scores_real):
-        total_loss, real_loss, fake_loss = loss_func(score_fake=score_fake, score_real=score_real)
+        total_loss, real_loss_, fake_loss_ = loss_func(score_fake=score_fake, score_real=score_real)
        loss += total_loss
-        real_loss += real_loss
-        fake_loss += fake_loss
+        real_loss += real_loss_
+        fake_loss += fake_loss_
    # normalize loss values with number of scales (discriminators)
    loss /= len(scores_fake)
    real_loss /= len(scores_real)
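The old code rebound `real_loss` and `fake_loss` inside the loop, so each scale overwrote the running totals instead of adding to them. A standalone sketch of the corrected accumulation with toy numbers (not the library's discriminator outputs):

```python
def accumulate(per_scale_losses):
    """Average (total, real, fake) losses over discriminator scales."""
    loss = real_loss = fake_loss = 0.0
    for total_loss, real_loss_, fake_loss_ in per_scale_losses:
        # distinct loop names keep the accumulators from being clobbered each iteration
        loss += total_loss
        real_loss += real_loss_
        fake_loss += fake_loss_
    n = len(per_scale_losses)
    return loss / n, real_loss / n, fake_loss / n

print(accumulate([(1.0, 0.6, 0.4), (2.0, 1.2, 0.8), (3.0, 1.8, 1.2)]))
# -> (2.0, 1.2, 0.8)
```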


@ -124,7 +124,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
print(TTS().list_models())
# Init TTS
-tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device)
+tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
# Run TTS
# ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language
@ -198,19 +198,12 @@ from TTS.api import CS_API
# Init 🐸 Coqui Studio API
# you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument.
-# XTTS - Best quality and life-like speech in EN
+# XTTS - Best quality and life-like speech in multiple languages. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
api = CS_API(api_token=<token>, model="XTTS")
api.speakers # all the speakers are available with all the models.
api.list_speakers()
api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
+wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", language="en", speed=1.5)
-# XTTS-multilingual - Multilingual XTTS with [en, de, es, fr, it, pt, ...] (more langs coming soon)
-api = CS_API(api_token=<token>, model="XTTS-multilingual")
-api.speakers
-api.list_speakers()
-api.list_voices()
-wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
# V1 - Fast and lightweight TTS in EN with emotion control.
api = CS_API(api_token=<token>, model="V1")
@ -238,4 +231,4 @@ api.tts_with_vc_to_file(
speaker_wav="target/speaker.wav", speaker_wav="target/speaker.wav",
file_path="ouptut.wav" file_path="ouptut.wav"
) )
``` ```


@ -24,8 +24,7 @@ a few tricks to make it faster and support streaming inference.
Current implementation only supports inference.

### Languages
-As of now, XTTS-v2 supports 16 languages: English, Spanish, French, German, Italian, Portuguese,
-Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese (Simplified), Japanese, Hungarian, Korean
+As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko).

Stay tuned as we continue to add support for more languages. If you have any language requests, please feel free to reach out.
@ -116,7 +115,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
model.cuda()

print("Computing speaker latents...")
-gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
+gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])

print("Inference...")
out = model.inference(
@ -124,7 +123,6 @@ out = model.inference(
"en", "en",
gpt_cond_latent, gpt_cond_latent,
speaker_embedding, speaker_embedding,
diffusion_conditioning,
temperature=0.7, # Add custom parameters here temperature=0.7, # Add custom parameters here
) )
torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000) torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
@ -153,7 +151,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
model.cuda()

print("Computing speaker latents...")
-gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
+gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])

print("Inference...")
t0 = time.time()
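The streaming example continues past this hunk; a hedged sketch of the remainder, reusing `model`, `gpt_cond_latent`, `speaker_embedding` and `t0` from the snippet above together with the two-value `get_conditioning_latents()` shown in this diff:

```python
chunks = model.inference_stream(
    "It took me quite a long time to develop a voice.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
)

wav_chunks = []
for i, chunk in enumerate(chunks):
    if i == 0:
        print(f"Time to first chunk: {time.time() - t0}")
    print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
    wav_chunks.append(chunk)

wav = torch.cat(wav_chunks, dim=0)
torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)
```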
@ -210,7 +208,7 @@ model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENI
model.cuda()

print("Computing speaker latents...")
-gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])
+gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])

print("Inference...")
out = model.inference(
@ -218,7 +216,6 @@ out = model.inference(
"en", "en",
gpt_cond_latent, gpt_cond_latent,
speaker_embedding, speaker_embedding,
diffusion_conditioning,
temperature=0.7, # Add custom parameters here temperature=0.7, # Add custom parameters here
) )
torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000) torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000)


@ -14,7 +14,6 @@ from TTS.utils.manage import ModelManager
MODELS_WITH_SEP_TESTS = [
    "tts_models/multilingual/multi-dataset/bark",
    "tts_models/en/multi-dataset/tortoise-v2",
-    "tts_models/multilingual/multi-dataset/xtts_v1",
    "tts_models/multilingual/multi-dataset/xtts_v1.1",
    "tts_models/multilingual/multi-dataset/xtts_v2",
]
@ -83,14 +82,14 @@ def test_xtts():
if use_gpu:
    run_cli(
        "yes | "
-        f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1 "
+        f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 "
        f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True '
        f'--speaker_wav "{speaker_wav}" --language_idx "en"'
    )
else:
    run_cli(
        "yes | "
-        f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1 "
+        f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 "
        f'--text "This is an example." --out_path "{output_path}" --progress_bar False '
        f'--speaker_wav "{speaker_wav}" --language_idx "en"'
    )
@ -104,7 +103,7 @@ def test_xtts_streaming():
speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")]
speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav")
speaker_wav.append(speaker_wav_2)
-model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1")
+model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1.1")
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)