Merge pull request #3173 from coqui-ai/dev

v0.20.2
Eren Gölge 2023-11-08 16:08:22 +01:00 committed by GitHub
commit ab57c36c2b
15 changed files with 383 additions and 437 deletions

View File

@@ -2,7 +2,7 @@
## 🐸Coqui.ai News
- 📣 ⓍTTSv2 is here with 16 languages and better performance across the board.
- 📣 ⓍTTS fine-tuning code is out. Check the [example recipes](https://github.com/coqui-ai/TTS/tree/dev/recipes/ljspeech).
- 📣 ⓍTTS can now stream with <200ms latency.
- 📣 ⓍTTS, our production TTS model that can speak 13 languages, is released [Blog Post](https://coqui.ai/blog/tts/open_xtts), [Demo](https://huggingface.co/spaces/coqui/xtts), [Docs](https://tts.readthedocs.io/en/dev/models/xtts.html)
- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
@@ -205,7 +205,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
print(TTS().list_models())
# Init TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device)
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
# Run TTS
# ❗ Since this is a multi-lingual voice cloning model, we must set the target speaker_wav and language
@@ -267,19 +267,13 @@ models = TTS(cs_api_model="XTTS").list_models()
# Init TTS with the target studio speaker
tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
# Run TTS
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH)
tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH)
# V1 model
models = TTS(cs_api_model="V1").list_models()
# Run TTS with emotion and speed control
# Emotion control only works with V1 model
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
# XTTS-multilingual
models = TTS(cs_api_model="XTTS-multilingual").list_models()
# Run TTS with emotion and speed control
# Emotion control only works with V1 model
tts.tts_to_file(text="Das ist ein Test.", file_path=OUTPUT_PATH, language="de", speed=1.0)
```
#### Example text to speech using **Fairseq models in ~1100 languages** 🤯.

View File

@@ -1 +1 @@
0.20.1
0.20.2

View File

@@ -60,7 +60,7 @@ class TTS(nn.Module):
vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
"XTTS", "XTTS-multilingual", "V1". You can also use `TTS.cs_api.CS_API" for more control.
"XTTS", "V1". You can also use `TTS.cs_api.CS_API" for more control.
Defaults to "XTTS".
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
"""
@@ -275,7 +275,7 @@ class TTS(nn.Module):
speaker_name (str, optional):
Speaker name from Coqui Studio. Defaults to None.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
supported by `XTTS` model.
emotion (str, optional):
Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
with "V1" model. Defaults to None.
@@ -321,7 +321,7 @@ class TTS(nn.Module):
Speaker name for multi-speaker models. You can check whether the loaded model is multi-speaker by
`tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
supported by `XTTS` model.
speaker_wav (str, optional):
Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
Defaults to None.

View File

@@ -227,7 +227,7 @@ def main():
parser.add_argument(
"--cs_model",
type=str,
help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `XTTS-multilingual`, `V1`.",
help="Name of the 🐸Coqui Studio model. Available models are `XTTS`, `V1`.",
)
parser.add_argument(
"--emotion",
@@ -238,7 +238,7 @@ def main():
parser.add_argument(
"--language",
type=str,
help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS-multilingual` model.",
help="Language to condition the model with. Only available for 🐸Coqui Studio `XTTS` model.",
default=None,
)
parser.add_argument(

View File

@@ -43,7 +43,7 @@ class CS_API:
Args:
api_token (str): 🐸Coqui Studio API token. If not provided, it will be read from the environment variable
`COQUI_STUDIO_TOKEN`.
model (str): 🐸Coqui Studio model. It can be either `V1`, `XTTS`, or `XTTS-multilang`. Default is `XTTS`.
model (str): 🐸Coqui Studio model. It can be either `V1` or `XTTS`. Defaults to `XTTS`.
Example listing all available speakers:
@@ -65,7 +65,7 @@ class CS_API:
Example with multi-language model:
>>> from TTS.api import CS_API
>>> tts = CS_API(model="XTTS-multilang")
>>> tts = CS_API(model="XTTS")
>>> wav, sr = api.tts("Hello world", speaker_name=tts.speakers[0].name, language="en")
"""
@@ -78,16 +78,12 @@ class CS_API:
"XTTS": {
"list_speakers": "https://app.coqui.ai/api/v2/speakers",
"synthesize": "https://app.coqui.ai/api/v2/samples/xtts/render/",
"list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
},
"XTTS-multilang": {
"list_speakers": "https://app.coqui.ai/api/v2/speakers",
"synthesize": "https://app.coqui.ai/api/v2/samples/multilingual/render/",
"list_voices": "https://app.coqui.ai/api/v2/voices/xtts/",
"list_voices": "https://app.coqui.ai/api/v2/voices/xtts",
},
}
SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl"]
SUPPORTED_LANGUAGES = ["en", "es", "de", "fr", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja"]
def __init__(self, api_token=None, model="XTTS"):
self.api_token = api_token
@@ -139,7 +135,7 @@ class CS_API:
self._check_token()
conn = http.client.HTTPSConnection("app.coqui.ai")
url = self.MODEL_ENDPOINTS[self.model]["list_speakers"]
conn.request("GET", f"{url}?per_page=100", headers=self.headers)
conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
res = conn.getresponse()
data = res.read()
return [Speaker(s) for s in json.loads(data)["result"]]
@@ -148,7 +144,7 @@ class CS_API:
"""List custom voices created by the user."""
conn = http.client.HTTPSConnection("app.coqui.ai")
url = self.MODEL_ENDPOINTS[self.model]["list_voices"]
conn.request("GET", f"{url}", headers=self.headers)
conn.request("GET", f"{url}?page=1&per_page=100", headers=self.headers)
res = conn.getresponse()
data = res.read()
return [Speaker(s, True) for s in json.loads(data)["result"]]
@@ -197,14 +193,6 @@ class CS_API:
}
)
elif model == "XTTS":
payload.update(
{
"name": speaker.name,
"text": text,
"speed": speed,
}
)
elif model == "XTTS-multilang":
payload.update(
{
"name": speaker.name,
@@ -226,13 +214,10 @@ class CS_API:
assert language is None, "❗ language is not supported for V1 model."
elif self.model == "XTTS":
assert emotion is None, f"❗ Emotions are not supported for XTTS model. Use V1 model."
assert language is None, "❗ Language is not supported for XTTS model. Use XTTS-multilang model."
elif self.model == "XTTS-multilang":
assert emotion is None, f"❗ Emotions are not supported for XTTS-multilang model. Use V1 model."
assert language is not None, "❗ Language is required for XTTS-multilang model."
assert language is not None, "❗ Language is required for XTTS model."
assert (
language in self.SUPPORTED_LANGUAGES
), f"❗ Language {language} is not yet supported. Use one of: en, es, de, fr, it, pt, pl"
), f"❗ Language {language} is not yet supported. Check https://docs.coqui.ai/reference/samples_xtts_create."
return text, speaker_name, speaker_id, emotion, speed, language
def tts(
@@ -255,7 +240,7 @@ class CS_API:
supported by `V1` model. Defaults to None.
speed (float): Speed of the speech. 1.0 is normal speed.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
"""
self._check_token()
self.ping_api()
@@ -305,7 +290,7 @@ class CS_API:
speed (float): Speed of the speech. 1.0 is normal speed.
pipe_out (BytesIO, optional): Flag to stdout the generated TTS wav file for shell pipe.
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
supported by `XTTS-multilang` model. Currently supports en, de, es, fr, it, pt, pl. Defaults to "en".
supported by `XTTS` model. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages. Defaults to "en".
file_path (str): Path to save the file. If None, a temporary file is created.
"""
if file_path is None:
@@ -323,20 +308,7 @@ if __name__ == "__main__":
print(api.list_speakers_as_tts_models())
ts = time.time()
wav, sr = api.tts("It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name)
wav, sr = api.tts("It took me quite a long time to develop a voice.", language="en", speaker_name=api.speakers[0].name)
print(f" [i] XTTS took {time.time() - ts:.2f}s")
filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav")
api = CS_API(model="XTTS-multilang")
print(api.speakers)
ts = time.time()
wav, sr = api.tts(
"It took me quite a long time to develop a voice.", speaker_name=api.speakers[0].name, language="en"
)
print(f" [i] XTTS took {time.time() - ts:.2f}s")
filepath = api.tts_to_file(
text="Hello world!", speaker_name=api.speakers[0].name, file_path="output.wav", language="en"
)
filepath = api.tts_to_file(text="Hello world!", speaker_name=api.speakers[0].name, language="en", file_path="output.wav")

View File

@@ -37,29 +37,11 @@ class XttsConfig(BaseTTSConfig):
If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.
Defaults to `0.8`.
cond_free_k (float):
Knob that determines how to balance the conditioning free signal with the conditioning-present signal. [0,inf].
As cond_free_k increases, the output becomes dominated by the conditioning-free signal.
Formula is: output=cond_present_output*(cond_free_k+1)-cond_absenct_output*cond_free_k. Defaults to `2.0`.
diffusion_temperature (float):
Controls the variance of the noise fed into the diffusion model. [0,1]. Values at 0
are the "mean" prediction of the diffusion network and will sound bland and smeared.
Defaults to `1.0`.
num_gpt_outputs (int):
Number of samples taken from the autoregressive model, all of which are filtered using CLVP.
As XTTS is a probabilistic model, more samples means a higher probability of creating something "great".
Defaults to `16`.
decoder_iterations (int):
Number of diffusion steps to perform. [0,4000]. More steps means the network has more chances to iteratively refine
the output, which should theoretically mean a higher quality output. Generally a value above 250 is not noticeably better,
however. Defaults to `30`.
decoder_sampler (str):
Diffusion sampler to be used. `ddim` or `dpm++2m`. Defaults to `ddim`.
gpt_cond_len (int):
Secs audio to be used as conditioning for the autoregressive model. Defaults to `3`.
@@ -110,11 +92,7 @@ class XttsConfig(BaseTTSConfig):
repetition_penalty: float = 2.0
top_k: int = 50
top_p: float = 0.85
cond_free_k: float = 2.0
diffusion_temperature: float = 1.0
num_gpt_outputs: int = 1
decoder_iterations: int = 30
decoder_sampler: str = "ddim"
# cloning
gpt_cond_len: int = 3
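
A quick, hedged sanity check of the slimmed-down config (assumes this version of 🐸TTS is installed; attribute names are taken from the diff above):

```python
# Sketch: the diffusion-decoder knobs removed above are gone from XttsConfig,
# while the remaining sampling defaults stay as documented.
from TTS.tts.configs.xtts_config import XttsConfig

config = XttsConfig()
print(config.top_k, config.top_p)             # 50 0.85
print(config.gpt_cond_len)                    # 3
print(hasattr(config, "cond_free_k"))         # False after this change
print(hasattr(config, "decoder_iterations"))  # False after this change
```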

View File

@@ -8,6 +8,7 @@ from hangul_romanize import Transliter
from hangul_romanize.rule import academic
from num2words import num2words
from tokenizers import Tokenizer
from functools import cached_property
from TTS.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words
@@ -535,11 +536,50 @@ DEFAULT_VOCAB_FILE = os.path.join(os.path.dirname(os.path.realpath(__file__)), "
class VoiceBpeTokenizer:
def __init__(self, vocab_file=None):
self.tokenizer = None
self.katsu = None
if vocab_file is not None:
self.tokenizer = Tokenizer.from_file(vocab_file)
self.char_limits = {
"en": 250,
"de": 253,
"fr": 273,
"es": 239,
"it": 213,
"pt": 203,
"pl": 224,
"zh-cn": 82,
"ar": 166,
"cs": 186,
"ru": 182,
"nl": 251,
"tr": 226,
"ja": 71,
"hu": 224,
"ko": 95,
}
@cached_property
def katsu(self):
import cutlet
return cutlet.Cutlet()
def check_input_length(self, txt, lang):
limit = self.char_limits.get(lang, 250)
if len(txt) > limit:
print(f"[!] Warning: The text length exceeds the character limit of {limit} for language '{lang}', this might cause truncated audio.")
def preprocess_text(self, txt, lang):
if lang in ["en", "es", "fr", "de", "pt", "it", "pl", "ar", "cs", "ru", "nl", "tr", "zh-cn"]:
txt = multilingual_cleaners(txt, lang)
if lang == "zh-cn":
txt = chinese_transliterate(txt)
elif lang == "ja":
txt = japanese_cleaners(txt, self.katsu)
else:
raise NotImplementedError()
return txt
def encode(self, txt, lang):
self.check_input_length(txt, lang)
txt = self.preprocess_text(txt, lang)
txt = f"[{lang}]{txt}"
txt = txt.replace(" ", "[SPACE]")
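
A minimal sketch of the new length guard (assumes this version of the tokenizer; `VoiceBpeTokenizer` can be constructed without a vocab file just to exercise `check_input_length`):

```python
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer

tokenizer = VoiceBpeTokenizer()  # no vocab file needed for the length check

too_long = "word " * 60  # 300 characters, above the 250-character English limit
tokenizer.check_input_length(too_long, "en")  # prints the truncation warning

short = "This is fine."
tokenizer.check_input_length(short, "en")  # silent, under the limit
tokenizer.check_input_length(short, "xx")  # unknown languages fall back to 250
```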

View File

@@ -152,19 +152,6 @@ class XttsArgs(Coqpit):
gpt_code_stride_len (int, optional): The hop_size of dvae and consequently of the gpt output. Defaults to 1024.
gpt_use_masking_gt_prompt_approach (bool, optional): If True, it will use ground truth as prompt and it will mask the loss to avoid repetition. Defaults to True.
gpt_use_perceiver_resampler (bool, optional): If True, it will use perceiver resampler from flamingo paper - https://arxiv.org/abs/2204.14198. Defaults to False.
For DiffTTS model:
diff_model_channels (int, optional): The number of channels for the DiffTTS model. Defaults to 1024.
diff_num_layers (int, optional): The number of layers for the DiffTTS model. Defaults to 10.
diff_in_channels (int, optional): The input channels for the DiffTTS model. Defaults to 100.
diff_out_channels (int, optional): The output channels for the DiffTTS model. Defaults to 200.
diff_in_latent_channels (int, optional): The input latent channels for the DiffTTS model. Defaults to 1024.
diff_in_tokens (int, optional): The input tokens for the DiffTTS model. Defaults to 8193.
diff_dropout (int, optional): The dropout percentage for the DiffTTS model. Defaults to 0.
diff_use_fp16 (bool, optional): Whether to use fp16 for the DiffTTS model. Defaults to False.
diff_num_heads (int, optional): The number of heads for the DiffTTS model. Defaults to 16.
diff_layer_drop (int, optional): The layer dropout percentage for the DiffTTS model. Defaults to 0.
diff_unconditioned_percentage (int, optional): The percentage of unconditioned inputs for the DiffTTS model. Defaults to 0.
"""
gpt_batch_size: int = 1
@@ -193,19 +180,6 @@ class XttsArgs(Coqpit):
gpt_use_masking_gt_prompt_approach: bool = True
gpt_use_perceiver_resampler: bool = False
# Diffusion Decoder params
diff_model_channels: int = 1024
diff_num_layers: int = 10
diff_in_channels: int = 100
diff_out_channels: int = 200
diff_in_latent_channels: int = 1024
diff_in_tokens: int = 8193
diff_dropout: int = 0
diff_use_fp16: bool = False
diff_num_heads: int = 16
diff_layer_drop: int = 0
diff_unconditioned_percentage: int = 0
# HifiGAN Decoder params
input_sample_rate: int = 22050
output_sample_rate: int = 24000
@@ -426,10 +400,6 @@ class Xtts(BaseTTS):
"repetition_penalty": config.repetition_penalty,
"top_k": config.top_k,
"top_p": config.top_p,
"cond_free_k": config.cond_free_k,
"diffusion_temperature": config.diffusion_temperature,
"decoder_iterations": config.decoder_iterations,
"decoder_sampler": config.decoder_sampler,
"gpt_cond_len": config.gpt_cond_len,
"max_ref_len": config.max_ref_len,
"sound_norm_refs": config.sound_norm_refs,
@@ -454,13 +424,6 @@ class Xtts(BaseTTS):
gpt_cond_len=6,
max_ref_len=10,
sound_norm_refs=False,
# Decoder inference
decoder_iterations=100,
cond_free=True,
cond_free_k=2,
diffusion_temperature=1.0,
decoder_sampler="ddim",
decoder="hifigan",
**hf_generate_kwargs,
):
"""
@@ -603,10 +566,21 @@ class Xtts(BaseTTS):
if wav_gen_prev is not None:
wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len) : -overlap_len]
if wav_overlap is not None:
crossfade_wav = wav_chunk[:overlap_len]
crossfade_wav = crossfade_wav * torch.linspace(0.0, 1.0, overlap_len).to(crossfade_wav.device)
wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len).to(wav_overlap.device)
wav_chunk[:overlap_len] += crossfade_wav
# cross fade the overlap section
if overlap_len > len(wav_chunk):
# wav_chunk is smaller than overlap_len, pass on last wav_gen
if wav_gen_prev is not None:
wav_chunk = wav_gen[(wav_gen_prev.shape[0] - overlap_len):]
else:
# not expected to be hit here, as the problem only occurs on the last chunk
wav_chunk = wav_gen[-overlap_len:]
return wav_chunk, wav_gen, None
else:
crossfade_wav = wav_chunk[:overlap_len]
crossfade_wav = crossfade_wav * torch.linspace(0.0, 1.0, overlap_len).to(crossfade_wav.device)
wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len).to(wav_overlap.device)
wav_chunk[:overlap_len] += crossfade_wav
wav_overlap = wav_gen[-overlap_len:]
wav_gen_prev = wav_gen
return wav_chunk, wav_gen_prev, wav_overlap
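
The cross-fade above is easier to follow in isolation; this standalone sketch (synthetic tensors, not model output) applies the same linear fade-in/fade-out over the overlap region:

```python
import torch

overlap_len = 4
wav_overlap = torch.ones(overlap_len)  # tail of the previous chunk
wav_chunk = torch.full((8,), 0.5)      # freshly generated chunk

# fade the new chunk in while fading the previous tail out, then sum
crossfade_wav = wav_chunk[:overlap_len] * torch.linspace(0.0, 1.0, overlap_len)
wav_chunk[:overlap_len] = wav_overlap * torch.linspace(1.0, 0.0, overlap_len)
wav_chunk[:overlap_len] += crossfade_wav

print(wav_chunk)  # ramps smoothly from 1.0 down to 0.5 over the overlap
```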

View File

@@ -109,7 +109,6 @@ class ModelManager(object):
def _list_for_model_type(self, model_type):
models_name_list = []
model_count = 1
model_type = "tts_models"
models_name_list.extend(self._list_models(model_type, model_count))
return models_name_list
@@ -298,22 +297,22 @@ class ModelManager(object):
model_item = self.set_model_url(model_item)
return model_item, model_full_name, model, md5hash
def ask_tos(self, model_full_path):
@staticmethod
def ask_tos(model_full_path):
"""Ask the user to agree to the terms of service"""
tos_path = os.path.join(model_full_path, "tos_agreed.txt")
if not os.path.exists(tos_path):
print(" > You must agree to the terms of service to use this model.")
print(" | > Please see the terms of service at https://coqui.ai/cpml.txt")
print(' | > "I have read, understood and agreed the Terms and Conditions." - [y/n]')
answer = input(" | | > ")
if answer.lower() == "y":
with open(tos_path, "w") as f:
f.write("I have read, understood ad agree the Terms and Conditions.")
return True
else:
return False
print(" > You must agree to the terms of service to use this model.")
print(" | > Please see the terms of service at https://coqui.ai/cpml.txt")
print(' | > "I have read, understood and agreed to the Terms and Conditions." - [y/n]')
answer = input(" | | > ")
if answer.lower() == "y":
with open(tos_path, "w", encoding="utf-8") as f:
f.write("I have read, understood and agreed to the Terms and Conditions.")
return True
return False
def tos_agreed(self, model_item, model_full_path):
@staticmethod
def tos_agreed(model_item, model_full_path):
"""Check if the user has agreed to the terms of service"""
if "tos_required" in model_item and model_item["tos_required"]:
tos_path = os.path.join(model_full_path, "tos_agreed.txt")

View File

@@ -1,5 +1,278 @@
from dataclasses import dataclass, field
from typing import List
from typing import List, Optional
from coqpit import Coqpit
from TTS.vc.configs.shared_configs import BaseVCConfig
from TTS.vc.models.freevc import FreeVCArgs, FreeVCAudioConfig, FreeVCConfig
@dataclass
class FreeVCAudioConfig(Coqpit):
"""Audio configuration
Args:
max_wav_value (float):
The maximum value of the waveform.
input_sample_rate (int):
The sampling rate of the input waveform.
output_sample_rate (int):
The sampling rate of the output waveform.
filter_length (int):
The length of the filter.
hop_length (int):
The hop length.
win_length (int):
The window length.
n_mel_channels (int):
The number of mel channels.
mel_fmin (float):
The minimum frequency of the mel filterbank.
mel_fmax (Optional[float]):
The maximum frequency of the mel filterbank.
"""
max_wav_value: float = field(default=32768.0)
input_sample_rate: int = field(default=16000)
output_sample_rate: int = field(default=24000)
filter_length: int = field(default=1280)
hop_length: int = field(default=320)
win_length: int = field(default=1280)
n_mel_channels: int = field(default=80)
mel_fmin: float = field(default=0.0)
mel_fmax: Optional[float] = field(default=None)
@dataclass
class FreeVCArgs(Coqpit):
"""FreeVC model arguments
Args:
spec_channels (int):
The number of channels in the spectrogram.
inter_channels (int):
The number of channels in the intermediate layers.
hidden_channels (int):
The number of channels in the hidden layers.
filter_channels (int):
The number of channels in the filter layers.
n_heads (int):
The number of attention heads.
n_layers (int):
The number of layers.
kernel_size (int):
The size of the kernel.
p_dropout (float):
The dropout probability.
resblock (str):
The type of residual block.
resblock_kernel_sizes (List[int]):
The kernel sizes for the residual blocks.
resblock_dilation_sizes (List[List[int]]):
The dilation sizes for the residual blocks.
upsample_rates (List[int]):
The upsample rates.
upsample_initial_channel (int):
The number of channels in the initial upsample layer.
upsample_kernel_sizes (List[int]):
The kernel sizes for the upsample layers.
n_layers_q (int):
The number of layers in the quantization network.
use_spectral_norm (bool):
Whether to use spectral normalization.
gin_channels (int):
The number of channels in the global conditioning vector.
ssl_dim (int):
The dimension of the self-supervised learning embedding.
use_spk (bool):
Whether to use external speaker encoder.
"""
spec_channels: int = field(default=641)
inter_channels: int = field(default=192)
hidden_channels: int = field(default=192)
filter_channels: int = field(default=768)
n_heads: int = field(default=2)
n_layers: int = field(default=6)
kernel_size: int = field(default=3)
p_dropout: float = field(default=0.1)
resblock: str = field(default="1")
resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11])
resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2])
upsample_initial_channel: int = field(default=512)
upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
n_layers_q: int = field(default=3)
use_spectral_norm: bool = field(default=False)
gin_channels: int = field(default=256)
ssl_dim: int = field(default=1024)
use_spk: bool = field(default=False)
num_spks: int = field(default=0)
segment_size: int = field(default=8960)
@dataclass
class FreeVCConfig(BaseVCConfig):
"""Defines parameters for FreeVC End2End TTS model.
Args:
model (str):
Model name. Do not change unless you know what you are doing.
model_args (FreeVCArgs):
Model architecture arguments. Defaults to `FreeVCArgs()`.
audio (FreeVCAudioConfig):
Audio processing configuration. Defaults to `FreeVCAudioConfig()`.
grad_clip (List):
Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`.
lr_gen (float):
Initial learning rate for the generator. Defaults to 0.0002.
lr_disc (float):
Initial learning rate for the discriminator. Defaults to 0.0002.
lr_scheduler_gen (str):
Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_gen_params (dict):
Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
lr_scheduler_disc (str):
Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_disc_params (dict):
Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
scheduler_after_epoch (bool):
If true, step the schedulers after each epoch else after each step. Defaults to `False`.
optimizer (str):
Name of the optimizer to use with both the generator and the discriminator networks. One of the
`torch.optim.*`. Defaults to `AdamW`.
kl_loss_alpha (float):
Loss weight for KL loss. Defaults to 1.0.
disc_loss_alpha (float):
Loss weight for the discriminator loss. Defaults to 1.0.
gen_loss_alpha (float):
Loss weight for the generator loss. Defaults to 1.0.
feat_loss_alpha (float):
Loss weight for the feature matching loss. Defaults to 1.0.
mel_loss_alpha (float):
Loss weight for the mel loss. Defaults to 45.0.
return_wav (bool):
If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`.
compute_linear_spec (bool):
If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.
use_weighted_sampler (bool):
If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`.
weighted_sampler_attrs (dict):
Key returned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities
by overweighting `root_path` by 2.0. Defaults to `{}`.
weighted_sampler_multipliers (dict):
Weight each unique value of a key returned by the formatter for weighted sampling.
For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`.
It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`.
r (int):
Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.
add_blank (bool):
If true, a blank token is added in between every character. Defaults to `True`.
test_sentences (List[List]):
List of sentences with speaker and language information to be used for testing.
language_ids_file (str):
Path to the language ids file.
use_language_embedding (bool):
If true, language embedding is used. Defaults to `False`.
Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
Example:
>>> from TTS.vc.configs.freevc_config import FreeVCConfig
>>> config = FreeVCConfig()
"""
model: str = "freevc"
# model specific params
model_args: FreeVCArgs = field(default_factory=FreeVCArgs)
audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig)
# optimizer
# TODO with training support
# loss params
# TODO with training support
# data loader params
return_wav: bool = True
compute_linear_spec: bool = True
# sampler params
use_weighted_sampler: bool = False # TODO: move it to the base config
weighted_sampler_attrs: dict = field(default_factory=lambda: {})
weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
# overrides
r: int = 1 # DO NOT CHANGE
add_blank: bool = True
# multi-speaker settings
# use speaker embedding layer
num_speakers: int = 0
speakers_file: str = None
speaker_embedding_channels: int = 256
# use d-vectors
use_d_vector_file: bool = False
d_vector_file: List[str] = None
d_vector_dim: int = None
def __post_init__(self):
for key, val in self.model_args.items():
if hasattr(self, key):
self[key] = val

View File

@@ -1,4 +1,3 @@
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple, Union
import librosa
@@ -13,8 +12,8 @@ from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
import TTS.vc.modules.freevc.commons as commons
import TTS.vc.modules.freevc.modules as modules
from TTS.tts.utils.speakers import SpeakerManager
from TTS.utils.io import load_fsspec, save_checkpoint
from TTS.vc.configs.shared_configs import BaseVCConfig
from TTS.utils.io import load_fsspec
from TTS.vc.configs.freevc_config import FreeVCConfig
from TTS.vc.models.base_vc import BaseVC
from TTS.vc.modules.freevc.commons import get_padding, init_weights
from TTS.vc.modules.freevc.mel_processing import mel_spectrogram_torch
@@ -294,136 +293,6 @@ class SpeakerEncoder(torch.nn.Module):
return embed
@dataclass
class FreeVCAudioConfig(Coqpit):
"""Audio configuration
Args:
max_wav_value (float):
The maximum value of the waveform.
input_sample_rate (int):
The sampling rate of the input waveform.
output_sample_rate (int):
The sampling rate of the output waveform.
filter_length (int):
The length of the filter.
hop_length (int):
The hop length.
win_length (int):
The window length.
n_mel_channels (int):
The number of mel channels.
mel_fmin (float):
The minimum frequency of the mel filterbank.
mel_fmax (Optional[float]):
The maximum frequency of the mel filterbank.
"""
max_wav_value: float = field(default=32768.0)
input_sample_rate: int = field(default=16000)
output_sample_rate: int = field(default=24000)
filter_length: int = field(default=1280)
hop_length: int = field(default=320)
win_length: int = field(default=1280)
n_mel_channels: int = field(default=80)
mel_fmin: float = field(default=0.0)
mel_fmax: Optional[float] = field(default=None)
@dataclass
class FreeVCArgs(Coqpit):
"""FreeVC model arguments
Args:
spec_channels (int):
The number of channels in the spectrogram.
inter_channels (int):
The number of channels in the intermediate layers.
hidden_channels (int):
The number of channels in the hidden layers.
filter_channels (int):
The number of channels in the filter layers.
n_heads (int):
The number of attention heads.
n_layers (int):
The number of layers.
kernel_size (int):
The size of the kernel.
p_dropout (float):
The dropout probability.
resblock (str):
The type of residual block.
resblock_kernel_sizes (List[int]):
The kernel sizes for the residual blocks.
resblock_dilation_sizes (List[List[int]]):
The dilation sizes for the residual blocks.
upsample_rates (List[int]):
The upsample rates.
upsample_initial_channel (int):
The number of channels in the initial upsample layer.
upsample_kernel_sizes (List[int]):
The kernel sizes for the upsample layers.
n_layers_q (int):
The number of layers in the quantization network.
use_spectral_norm (bool):
Whether to use spectral normalization.
gin_channels (int):
The number of channels in the global conditioning vector.
ssl_dim (int):
The dimension of the self-supervised learning embedding.
use_spk (bool):
Whether to use external speaker encoder.
"""
spec_channels: int = field(default=641)
inter_channels: int = field(default=192)
hidden_channels: int = field(default=192)
filter_channels: int = field(default=768)
n_heads: int = field(default=2)
n_layers: int = field(default=6)
kernel_size: int = field(default=3)
p_dropout: float = field(default=0.1)
resblock: str = field(default="1")
resblock_kernel_sizes: List[int] = field(default_factory=lambda: [3, 7, 11])
resblock_dilation_sizes: List[List[int]] = field(default_factory=lambda: [[1, 3, 5], [1, 3, 5], [1, 3, 5]])
upsample_rates: List[int] = field(default_factory=lambda: [10, 8, 2, 2])
upsample_initial_channel: int = field(default=512)
upsample_kernel_sizes: List[int] = field(default_factory=lambda: [16, 16, 4, 4])
n_layers_q: int = field(default=3)
use_spectral_norm: bool = field(default=False)
gin_channels: int = field(default=256)
ssl_dim: int = field(default=1024)
use_spk: bool = field(default=False)
num_spks: int = field(default=0)
segment_size: int = field(default=8960)
class FreeVC(BaseVC):
"""
@@ -677,7 +546,7 @@ class FreeVC(BaseVC):
...
@staticmethod
def init_from_config(config: "VitsConfig", samples: Union[List[List], List[Dict]] = None, verbose=True):
def init_from_config(config: FreeVCConfig, samples: Union[List[List], List[Dict]] = None, verbose=True):
model = FreeVC(config)
return model
@@ -689,145 +558,3 @@ class FreeVC(BaseVC):
def train_step():
...
@dataclass
class FreeVCConfig(BaseVCConfig):
"""Defines parameters for FreeVC End2End TTS model.
Args:
model (str):
Model name. Do not change unless you know what you are doing.
model_args (FreeVCArgs):
Model architecture arguments. Defaults to `FreeVCArgs()`.
audio (FreeVCAudioConfig):
Audio processing configuration. Defaults to `FreeVCAudioConfig()`.
grad_clip (List):
Gradient clipping thresholds for each optimizer. Defaults to `[1000.0, 1000.0]`.
lr_gen (float):
Initial learning rate for the generator. Defaults to 0.0002.
lr_disc (float):
Initial learning rate for the discriminator. Defaults to 0.0002.
lr_scheduler_gen (str):
Name of the learning rate scheduler for the generator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_gen_params (dict):
Parameters for the learning rate scheduler of the generator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
lr_scheduler_disc (str):
Name of the learning rate scheduler for the discriminator. One of the `torch.optim.lr_scheduler.*`. Defaults to
`ExponentialLR`.
lr_scheduler_disc_params (dict):
Parameters for the learning rate scheduler of the discriminator. Defaults to `{'gamma': 0.999875, "last_epoch":-1}`.
scheduler_after_epoch (bool):
If true, step the schedulers after each epoch else after each step. Defaults to `False`.
optimizer (str):
Name of the optimizer to use with both the generator and the discriminator networks. One of the
`torch.optim.*`. Defaults to `AdamW`.
kl_loss_alpha (float):
Loss weight for KL loss. Defaults to 1.0.
disc_loss_alpha (float):
Loss weight for the discriminator loss. Defaults to 1.0.
gen_loss_alpha (float):
Loss weight for the generator loss. Defaults to 1.0.
feat_loss_alpha (float):
Loss weight for the feature matching loss. Defaults to 1.0.
mel_loss_alpha (float):
Loss weight for the mel loss. Defaults to 45.0.
return_wav (bool):
If true, data loader returns the waveform as well as the other outputs. Do not change. Defaults to `True`.
compute_linear_spec (bool):
If true, the linear spectrogram is computed and returned alongside the mel output. Do not change. Defaults to `True`.
use_weighted_sampler (bool):
If true, use weighted sampler with bucketing for balancing samples between datasets used in training. Defaults to `False`.
weighted_sampler_attrs (dict):
Key retuned by the formatter to be used for weighted sampler. For example `{"root_path": 2.0, "speaker_name": 1.0}` sets sample probabilities
by overweighting `root_path` by 2.0. Defaults to `{}`.
weighted_sampler_multipliers (dict):
Weight each unique value of a key returned by the formatter for weighted sampling.
For example `{"root_path":{"/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-100/":1.0, "/raid/datasets/libritts-clean-16khz-bwe-coqui_44khz/LibriTTS/train-clean-360/": 0.5}`.
It will sample instances from `train-clean-100` 2 times more than `train-clean-360`. Defaults to `{}`.
r (int):
Number of spectrogram frames to be generated at a time. Do not change. Defaults to `1`.
add_blank (bool):
If true, a blank token is added in between every character. Defaults to `True`.
test_sentences (List[List]):
List of sentences with speaker and language information to be used for testing.
language_ids_file (str):
Path to the language ids file.
use_language_embedding (bool):
If true, language embedding is used. Defaults to `False`.
Note:
Check :class:`TTS.tts.configs.shared_configs.BaseTTSConfig` for the inherited parameters.
Example:
>>> from TTS.tts.configs.freevc_config import FreeVCConfig
>>> config = FreeVCConfig()
"""
model: str = "freevc"
# model specific params
model_args: FreeVCArgs = field(default_factory=FreeVCArgs)
audio: FreeVCAudioConfig = field(default_factory=FreeVCAudioConfig)
# optimizer
# TODO with training support
# loss params
# TODO with training support
# data loader params
return_wav: bool = True
compute_linear_spec: bool = True
# sampler params
use_weighted_sampler: bool = False # TODO: move it to the base config
weighted_sampler_attrs: dict = field(default_factory=lambda: {})
weighted_sampler_multipliers: dict = field(default_factory=lambda: {})
# overrides
r: int = 1 # DO NOT CHANGE
add_blank: bool = True
# multi-speaker settings
# use speaker embedding layer
num_speakers: int = 0
speakers_file: str = None
speaker_embedding_channels: int = 256
# use d-vectors
use_d_vector_file: bool = False
d_vector_file: List[str] = None
d_vector_dim: int = None
def __post_init__(self):
for key, val in self.model_args.items():
if hasattr(self, key):
self[key] = val

View File

@@ -195,10 +195,10 @@ def _apply_D_loss(scores_fake, scores_real, loss_func):
if isinstance(scores_fake, list):
# multi-scale loss
for score_fake, score_real in zip(scores_fake, scores_real):
total_loss, real_loss, fake_loss = loss_func(score_fake=score_fake, score_real=score_real)
total_loss, real_loss_, fake_loss_ = loss_func(score_fake=score_fake, score_real=score_real)
loss += total_loss
real_loss += real_loss
fake_loss += fake_loss
real_loss += real_loss_
fake_loss += fake_loss_
# normalize loss values with number of scales (discriminators)
loss /= len(scores_fake)
real_loss /= len(scores_real)
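
For context, the renaming matters because the old tuple unpacking shadowed the accumulators; a minimal illustration with made-up per-scale losses:

```python
per_scale = [1.0, 2.0, 4.0]

# old pattern: unpacking reused the accumulator's name, so
# `real_loss += real_loss` only doubles the current scale's loss
real_loss = 0.0
for current in per_scale:
    real_loss = current   # shadowed by `..., real_loss, ... = loss_func(...)`
    real_loss += real_loss
assert real_loss == 8.0   # 2 * 4.0; earlier scales are lost

# new pattern: accumulate under a distinct name
real_loss = 0.0
for current in per_scale:
    real_loss_ = current
    real_loss += real_loss_
assert real_loss == 7.0   # 1.0 + 2.0 + 4.0
```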

View File

@@ -124,7 +124,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
print(TTS().list_models())
# Init TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1").to(device)
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
# Run TTS
# ❗ Since this is a multi-lingual voice cloning model, we must set the target speaker_wav and language
@@ -198,19 +198,12 @@ from TTS.api import CS_API
# Init 🐸 Coqui Studio API
# you can either set the API token as an environment variable `COQUI_STUDIO_TOKEN` or pass it as an argument.
# XTTS - Best quality and life-like speech in EN
# XTTS - Best quality and life-like speech in multiple languages. See https://docs.coqui.ai/reference/samples_xtts_create for supported languages.
api = CS_API(api_token=<token>, model="XTTS")
api.speakers # all the speakers are available with all the models.
api.list_speakers()
api.list_voices()
wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
# XTTS-multilingual - Multilingual XTTS with [en, de, es, fr, it, pt, ...] (more langs coming soon)
api = CS_API(api_token=<token>, model="XTTS-multilingual")
api.speakers
api.list_speakers()
api.list_voices()
wav, sample_rate = api.tts(text="This is a test.", speaker=api.speakers[0].name, emotion="Happy", speed=1.5)
wav, sample_rate = api.tts(text="This is a test.", speaker_name=api.speakers[0].name, language="en", speed=1.5)
# V1 - Fast and lightweight TTS in EN with emotion control.
api = CS_API(api_token=<token>, model="V1")
@@ -238,4 +231,4 @@ api.tts_with_vc_to_file(
speaker_wav="target/speaker.wav",
file_path="ouptut.wav"
)
```

View File

@@ -24,8 +24,7 @@ a few tricks to make it faster and support streaming inference.
Current implementation only supports inference.
### Languages
As of now, XTTS-v2 supports 16 languages: English, Spanish, French, German, Italian, Portuguese,
Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese (Simplified), Japanese, Hungarian, Korean
As of now, XTTS-v2 supports 16 languages: English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Polish (pl), Turkish (tr), Russian (ru), Dutch (nl), Czech (cs), Arabic (ar), Chinese (zh-cn), Japanese (ja), Hungarian (hu) and Korean (ko).
Stay tuned as we continue to add support for more languages. If you have any language requests, please feel free to reach out.
@@ -116,7 +115,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
model.cuda()
print("Computing speaker latents...")
gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
print("Inference...")
out = model.inference(
@@ -124,7 +123,6 @@ out = model.inference(
"en",
gpt_cond_latent,
speaker_embedding,
diffusion_conditioning,
temperature=0.7, # Add custom parameters here
)
torchaudio.save("xtts.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)
@@ -153,7 +151,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
model.cuda()
print("Computing speaker latents...")
gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
print("Inference...")
t0 = time.time()
@@ -210,7 +208,7 @@ model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENI
model.cuda()
print("Computing speaker latents...")
gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])
print("Inference...")
out = model.inference(
@@ -218,7 +216,6 @@ out = model.inference(
"en",
gpt_cond_latent,
speaker_embedding,
diffusion_conditioning,
temperature=0.7, # Add custom parameters here
)
torchaudio.save(OUTPUT_WAV_PATH, torch.tensor(out["wav"]).unsqueeze(0), 24000)

View File

@@ -14,7 +14,6 @@ from TTS.utils.manage import ModelManager
MODELS_WITH_SEP_TESTS = [
"tts_models/multilingual/multi-dataset/bark",
"tts_models/en/multi-dataset/tortoise-v2",
"tts_models/multilingual/multi-dataset/xtts_v1",
"tts_models/multilingual/multi-dataset/xtts_v1.1",
"tts_models/multilingual/multi-dataset/xtts_v2",
]
@@ -83,14 +82,14 @@ def test_xtts():
if use_gpu:
run_cli(
"yes | "
f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1 "
f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 "
f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True '
f'--speaker_wav "{speaker_wav}" --language_idx "en"'
)
else:
run_cli(
"yes | "
f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1 "
f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v1.1 "
f'--text "This is an example." --out_path "{output_path}" --progress_bar False '
f'--speaker_wav "{speaker_wav}" --language_idx "en"'
)
@@ -104,7 +103,7 @@ def test_xtts_streaming():
speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")]
speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav")
speaker_wav.append(speaker_wav_2)
model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1")
model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1.1")
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)