Merge pull request #3156 from coqui-ai/dev

v0.20.1
This commit is contained in:
Eren Gölge 2023-11-07 14:18:00 +01:00 committed by GitHub
commit 063556abf4
17 changed files with 168 additions and 2042 deletions


@ -10,34 +10,22 @@
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json", "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5" "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/hash.md5"
], ],
"model_hash": "6a09d1ad43896f06041ed8195956c9698f13b6189dc80f1c74bdc2b8e8d15324",
"default_vocoder": null, "default_vocoder": null,
"commit": "480a6cdf7", "commit": "480a6cdf7",
"license": "CPML", "license": "CPML",
"contact": "info@coqui.ai", "contact": "info@coqui.ai",
"tos_required": true "tos_required": true
}, },
"xtts_v1": {
"description": "XTTS-v1 by Coqui with 13 languages and cross-language voice cloning.",
"hf_url": [
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/model.pth",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/config.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/hifigan/vocab.json"
],
"default_vocoder": null,
"commit": "e5140314",
"license": "CPML",
"contact": "info@coqui.ai",
"tos_required": true
},
"xtts_v1.1": { "xtts_v1.1": {
"description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.", "description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.",
"hf_url": [ "hf_url": [
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/model.pth", "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/model.pth",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/config.json", "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/config.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/vocab.json", "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/vocab.json",
"https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/hash.md5" "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/hash.md5"
], ],
"model_hash": "ae9e4b39e095fd5728fe7f7931ec66ad", "model_hash": "7c62beaf58d39b729de287330dc254e7b515677416839b649a50e7cf74c3df59",
"default_vocoder": null, "default_vocoder": null,
"commit": "82910a63", "commit": "82910a63",
"license": "CPML", "license": "CPML",


@ -1 +1 @@
0.20.0 0.20.1


@ -30,7 +30,7 @@ class XttsConfig(BaseTTSConfig):
which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative), which in turn is used to divide the score of the sequence. Since the score is the log likelihood of the sequence (i.e. negative),
length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences. length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter sequences.
reperation_penalty (float): repetition_penalty (float):
The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`. The parameter for repetition penalty. 1.0 means no penalty. Defaults to `2.0`.
top_p (float): top_p (float):
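This hunk only fixes the misspelled docstring entry (`reperation_penalty` → `repetition_penalty`). A minimal sketch of the documented knobs in use; the import path matches the test code later in this diff, and the values are illustrative apart from the stated defaults:

```python
from TTS.tts.configs.xtts_config import XttsConfig

config = XttsConfig()
config.length_penalty = 1.0      # > 0.0 favours longer sequences, < 0.0 shorter ones
config.repetition_penalty = 2.0  # 1.0 disables the penalty; the docstring default is 2.0
config.top_p = 0.85              # nucleus-sampling threshold, illustrative value
```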

File diff suppressed because it is too large


@ -2,13 +2,10 @@ import os
import random import random
import sys import sys
import numpy as np
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
import torch.utils.data import torch.utils.data
import torchaudio from TTS.tts.models.xtts import load_audio
from torchaudio.backend.soundfile_backend import load as torchaudio_soundfile_load
from torchaudio.backend.sox_io_backend import load as torchaudio_sox_load
torch.set_num_threads(1) torch.set_num_threads(1)
@ -50,31 +47,6 @@ def get_prompt_slice(gt_path, max_sample_length, min_sample_length, sample_rate,
return rel_clip, rel_clip.shape[-1], cond_idxs return rel_clip, rel_clip.shape[-1], cond_idxs
def load_audio(audiopath, sampling_rate):
# better load setting following: https://github.com/faroit/python_audio_loading_benchmark
if audiopath[-4:] == ".mp3":
# it uses torchaudio with sox backend to load mp3
audio, lsr = torchaudio_sox_load(audiopath)
else:
# it uses torchaudio soundfile backend to load all the others data type
audio, lsr = torchaudio_soundfile_load(audiopath)
# stereo to mono if needed
if audio.size(0) != 1:
audio = torch.mean(audio, dim=0, keepdim=True)
if lsr != sampling_rate:
audio = torchaudio.functional.resample(audio, lsr, sampling_rate)
# Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk.
# '10' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.
if torch.any(audio > 10) or not torch.any(audio < 0):
print(f"Error with {audiopath}. Max={audio.max()} min={audio.min()}")
# clip audio invalid values
audio.clip_(-1, 1)
return audio
class XTTSDataset(torch.utils.data.Dataset): class XTTSDataset(torch.utils.data.Dataset):
def __init__(self, config, samples, tokenizer, sample_rate, is_eval=False): def __init__(self, config, samples, tokenizer, sample_rate, is_eval=False):
self.config = config self.config = config


@ -238,7 +238,6 @@ class GPTTrainer(BaseTTS):
s_info["speaker_wav"], s_info["speaker_wav"],
s_info["language"], s_info["language"],
gpt_cond_len=3, gpt_cond_len=3,
decoder="ne_hifigan",
)["wav"] )["wav"]
test_audios["{}-audio".format(idx)] = wav test_audios["{}-audio".format(idx)] = wav


@ -1,385 +0,0 @@
import json
from dataclasses import dataclass
from enum import Enum
from typing import Callable, Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
MAX_WAV_VALUE = 32768.0
class KernelPredictor(torch.nn.Module):
"""Kernel predictor for the location-variable convolutions"""
def __init__(
self,
cond_channels,
conv_in_channels,
conv_out_channels,
conv_layers,
conv_kernel_size=3,
kpnet_hidden_channels=64,
kpnet_conv_size=3,
kpnet_dropout=0.0,
kpnet_nonlinear_activation="LeakyReLU",
kpnet_nonlinear_activation_params={"negative_slope": 0.1},
):
"""
Args:
cond_channels (int): number of channel for the conditioning sequence,
conv_in_channels (int): number of channel for the input sequence,
conv_out_channels (int): number of channel for the output sequence,
conv_layers (int): number of layers
"""
super().__init__()
self.conv_in_channels = conv_in_channels
self.conv_out_channels = conv_out_channels
self.conv_kernel_size = conv_kernel_size
self.conv_layers = conv_layers
kpnet_kernel_channels = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers # l_w
kpnet_bias_channels = conv_out_channels * conv_layers # l_b
self.input_conv = nn.Sequential(
nn.utils.weight_norm(nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=2, bias=True)),
getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
)
self.residual_convs = nn.ModuleList()
padding = (kpnet_conv_size - 1) // 2
for _ in range(3):
self.residual_convs.append(
nn.Sequential(
nn.Dropout(kpnet_dropout),
nn.utils.weight_norm(
nn.Conv1d(
kpnet_hidden_channels,
kpnet_hidden_channels,
kpnet_conv_size,
padding=padding,
bias=True,
)
),
getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
nn.utils.weight_norm(
nn.Conv1d(
kpnet_hidden_channels,
kpnet_hidden_channels,
kpnet_conv_size,
padding=padding,
bias=True,
)
),
getattr(nn, kpnet_nonlinear_activation)(**kpnet_nonlinear_activation_params),
)
)
self.kernel_conv = nn.utils.weight_norm(
nn.Conv1d(
kpnet_hidden_channels,
kpnet_kernel_channels,
kpnet_conv_size,
padding=padding,
bias=True,
)
)
self.bias_conv = nn.utils.weight_norm(
nn.Conv1d(
kpnet_hidden_channels,
kpnet_bias_channels,
kpnet_conv_size,
padding=padding,
bias=True,
)
)
def forward(self, c):
"""
Args:
c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
"""
batch, _, cond_length = c.shape
c = self.input_conv(c)
for residual_conv in self.residual_convs:
residual_conv.to(c.device)
c = c + residual_conv(c)
k = self.kernel_conv(c)
b = self.bias_conv(c)
kernels = k.contiguous().view(
batch,
self.conv_layers,
self.conv_in_channels,
self.conv_out_channels,
self.conv_kernel_size,
cond_length,
)
bias = b.contiguous().view(
batch,
self.conv_layers,
self.conv_out_channels,
cond_length,
)
return kernels, bias
def remove_weight_norm(self):
nn.utils.remove_weight_norm(self.input_conv[0])
nn.utils.remove_weight_norm(self.kernel_conv)
nn.utils.remove_weight_norm(self.bias_conv)
for block in self.residual_convs:
nn.utils.remove_weight_norm(block[1])
nn.utils.remove_weight_norm(block[3])
class LVCBlock(torch.nn.Module):
"""the location-variable convolutions"""
def __init__(
self,
in_channels,
cond_channels,
stride,
dilations=[1, 3, 9, 27],
lReLU_slope=0.2,
conv_kernel_size=3,
cond_hop_length=256,
kpnet_hidden_channels=64,
kpnet_conv_size=3,
kpnet_dropout=0.0,
):
super().__init__()
self.cond_hop_length = cond_hop_length
self.conv_layers = len(dilations)
self.conv_kernel_size = conv_kernel_size
self.kernel_predictor = KernelPredictor(
cond_channels=cond_channels,
conv_in_channels=in_channels,
conv_out_channels=2 * in_channels,
conv_layers=len(dilations),
conv_kernel_size=conv_kernel_size,
kpnet_hidden_channels=kpnet_hidden_channels,
kpnet_conv_size=kpnet_conv_size,
kpnet_dropout=kpnet_dropout,
kpnet_nonlinear_activation_params={"negative_slope": lReLU_slope},
)
self.convt_pre = nn.Sequential(
nn.LeakyReLU(lReLU_slope),
nn.utils.weight_norm(
nn.ConvTranspose1d(
in_channels,
in_channels,
2 * stride,
stride=stride,
padding=stride // 2 + stride % 2,
output_padding=stride % 2,
)
),
)
self.conv_blocks = nn.ModuleList()
for dilation in dilations:
self.conv_blocks.append(
nn.Sequential(
nn.LeakyReLU(lReLU_slope),
nn.utils.weight_norm(
nn.Conv1d(
in_channels,
in_channels,
conv_kernel_size,
padding=dilation * (conv_kernel_size - 1) // 2,
dilation=dilation,
)
),
nn.LeakyReLU(lReLU_slope),
)
)
def forward(self, x, c):
"""forward propagation of the location-variable convolutions.
Args:
x (Tensor): the input sequence (batch, in_channels, in_length)
c (Tensor): the conditioning sequence (batch, cond_channels, cond_length)
Returns:
Tensor: the output sequence (batch, in_channels, in_length)
"""
_, in_channels, _ = x.shape # (B, c_g, L')
x = self.convt_pre(x) # (B, c_g, stride * L')
kernels, bias = self.kernel_predictor(c)
for i, conv in enumerate(self.conv_blocks):
output = conv(x) # (B, c_g, stride * L')
k = kernels[:, i, :, :, :, :] # (B, 2 * c_g, c_g, kernel_size, cond_length)
b = bias[:, i, :, :] # (B, 2 * c_g, cond_length)
output = self.location_variable_convolution(
output, k, b, hop_size=self.cond_hop_length
) # (B, 2 * c_g, stride * L'): LVC
x = x + torch.sigmoid(output[:, :in_channels, :]) * torch.tanh(
output[:, in_channels:, :]
) # (B, c_g, stride * L'): GAU
return x
def location_variable_convolution(self, x, kernel, bias, dilation=1, hop_size=256):
"""perform location-variable convolution operation on the input sequence (x) using the local convolution kernl.
Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100.
Args:
x (Tensor): the input sequence (batch, in_channels, in_length).
kernel (Tensor): the local convolution kernel (batch, in_channel, out_channels, kernel_size, kernel_length)
bias (Tensor): the bias for the local convolution (batch, out_channels, kernel_length)
dilation (int): the dilation of convolution.
hop_size (int): the hop_size of the conditioning sequence.
Returns:
(Tensor): the output sequence after performing local convolution. (batch, out_channels, in_length).
"""
batch, _, in_length = x.shape
batch, _, out_channels, kernel_size, kernel_length = kernel.shape
assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched"
padding = dilation * int((kernel_size - 1) / 2)
x = F.pad(x, (padding, padding), "constant", 0) # (batch, in_channels, in_length + 2*padding)
x = x.unfold(2, hop_size + 2 * padding, hop_size) # (batch, in_channels, kernel_length, hop_size + 2*padding)
if hop_size < dilation:
x = F.pad(x, (0, dilation), "constant", 0)
x = x.unfold(
3, dilation, dilation
) # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation)
x = x[:, :, :, :, :hop_size]
x = x.transpose(3, 4) # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation)
x = x.unfold(4, kernel_size, 1) # (batch, in_channels, kernel_length, dilation, _, kernel_size)
o = torch.einsum("bildsk,biokl->bolsd", x, kernel)
o = o.to(memory_format=torch.channels_last_3d)
bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d)
o = o + bias
o = o.contiguous().view(batch, out_channels, -1)
return o
def remove_weight_norm(self):
self.kernel_predictor.remove_weight_norm()
nn.utils.remove_weight_norm(self.convt_pre[1])
for block in self.conv_blocks:
nn.utils.remove_weight_norm(block[1])
class UnivNetGenerator(nn.Module):
"""
UnivNet Generator
Originally from https://github.com/mindslab-ai/univnet/blob/master/model/generator.py.
"""
def __init__(
self,
noise_dim=64,
channel_size=32,
dilations=[1, 3, 9, 27],
strides=[8, 8, 4],
lReLU_slope=0.2,
kpnet_conv_size=3,
# Below are MEL configurations options that this generator requires.
hop_length=256,
n_mel_channels=100,
):
super(UnivNetGenerator, self).__init__()
self.mel_channel = n_mel_channels
self.noise_dim = noise_dim
self.hop_length = hop_length
channel_size = channel_size
kpnet_conv_size = kpnet_conv_size
self.res_stack = nn.ModuleList()
hop_length = 1
for stride in strides:
hop_length = stride * hop_length
self.res_stack.append(
LVCBlock(
channel_size,
n_mel_channels,
stride=stride,
dilations=dilations,
lReLU_slope=lReLU_slope,
cond_hop_length=hop_length,
kpnet_conv_size=kpnet_conv_size,
)
)
self.conv_pre = nn.utils.weight_norm(nn.Conv1d(noise_dim, channel_size, 7, padding=3, padding_mode="reflect"))
self.conv_post = nn.Sequential(
nn.LeakyReLU(lReLU_slope),
nn.utils.weight_norm(nn.Conv1d(channel_size, 1, 7, padding=3, padding_mode="reflect")),
nn.Tanh(),
)
def forward(self, c, z):
"""
Args:
c (Tensor): the conditioning sequence of mel-spectrogram (batch, mel_channels, in_length)
z (Tensor): the noise sequence (batch, noise_dim, in_length)
"""
z = self.conv_pre(z) # (B, c_g, L)
for res_block in self.res_stack:
res_block.to(z.device)
z = res_block(z, c) # (B, c_g, L * s_0 * ... * s_i)
z = self.conv_post(z) # (B, 1, L * 256)
return z
def eval(self, inference=False):
super(UnivNetGenerator, self).eval()
# don't remove weight norm while validation in training loop
if inference:
self.remove_weight_norm()
def remove_weight_norm(self):
nn.utils.remove_weight_norm(self.conv_pre)
for layer in self.conv_post:
if len(layer.state_dict()) != 0:
nn.utils.remove_weight_norm(layer)
for res_block in self.res_stack:
res_block.remove_weight_norm()
def inference(self, c, z=None):
# pad input mel with zeros to cut artifact
# see https://github.com/seungwonpark/melgan/issues/8
zero = torch.full((c.shape[0], self.mel_channel, 10), -11.5129).to(c.device)
mel = torch.cat((c, zero), dim=2)
if z is None:
z = torch.randn(c.shape[0], self.noise_dim, mel.size(2)).to(mel.device)
audio = self.forward(mel, z)
audio = audio[:, :, : -(self.hop_length * 10)]
audio = audio.clamp(min=-1, max=1)
return audio
if __name__ == "__main__":
model = UnivNetGenerator()
c = torch.randn(3, 100, 10)
z = torch.randn(3, 64, 10)
print(c.shape)
y = model(c, z)
print(y.shape)
assert y.shape == torch.Size([3, 1, 2560])
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(pytorch_total_params)


@ -252,7 +252,12 @@ class BaseTacotron(BaseTTS):
def compute_capacitron_VAE_embedding(self, inputs, reference_mel_info, text_info=None, speaker_embedding=None): def compute_capacitron_VAE_embedding(self, inputs, reference_mel_info, text_info=None, speaker_embedding=None):
"""Capacitron Variational Autoencoder""" """Capacitron Variational Autoencoder"""
(VAE_outputs, posterior_distribution, prior_distribution, capacitron_beta,) = self.capacitron_vae_layer( (
VAE_outputs,
posterior_distribution,
prior_distribution,
capacitron_beta,
) = self.capacitron_vae_layer(
reference_mel_info, reference_mel_info,
text_info, text_info,
speaker_embedding, # pylint: disable=not-callable speaker_embedding, # pylint: disable=not-callable


@ -676,7 +676,12 @@ class Tortoise(BaseTTS):
), "Too much text provided. Break the text up into separate segments and re-try inference." ), "Too much text provided. Break the text up into separate segments and re-try inference."
if voice_samples is not None: if voice_samples is not None:
(auto_conditioning, diffusion_conditioning, _, _,) = self.get_conditioning_latents( (
auto_conditioning,
diffusion_conditioning,
_,
_,
) = self.get_conditioning_latents(
voice_samples, voice_samples,
return_mels=True, return_mels=True,
latent_averaging_mode=latent_averaging_mode, latent_averaging_mode=latent_averaging_mode,


@ -9,13 +9,10 @@ import torchaudio
from coqpit import Coqpit from coqpit import Coqpit
from TTS.tts.layers.tortoise.audio_utils import denormalize_tacotron_mel, wav_to_univnet_mel from TTS.tts.layers.tortoise.audio_utils import denormalize_tacotron_mel, wav_to_univnet_mel
from TTS.tts.layers.tortoise.diffusion_decoder import DiffusionTts
from TTS.tts.layers.xtts.diffusion import SpacedDiffusion, get_named_beta_schedule, space_timesteps
from TTS.tts.layers.xtts.gpt import GPT from TTS.tts.layers.xtts.gpt import GPT
from TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder from TTS.tts.layers.xtts.hifigan_decoder import HifiDecoder
from TTS.tts.layers.xtts.stream_generator import init_stream_support from TTS.tts.layers.xtts.stream_generator import init_stream_support
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer
from TTS.tts.layers.xtts.vocoder import UnivNetGenerator
from TTS.tts.models.base_tts import BaseTTS from TTS.tts.models.base_tts import BaseTTS
from TTS.utils.io import load_fsspec from TTS.utils.io import load_fsspec
@ -70,6 +67,31 @@ def wav_to_mel_cloning(
return mel return mel
def load_audio(audiopath, sampling_rate):
# better load setting following: https://github.com/faroit/python_audio_loading_benchmark
if audiopath[-4:] == ".mp3":
# it uses torchaudio with sox backend to load mp3
audio, lsr = torchaudio.backend.sox_io_backend.load(audiopath)
else:
# it uses torchaudio soundfile backend to load all the others data type
audio, lsr = torchaudio.backend.soundfile_backend.load(audiopath)
# stereo to mono if needed
if audio.size(0) != 1:
audio = torch.mean(audio, dim=0, keepdim=True)
if lsr != sampling_rate:
audio = torchaudio.functional.resample(audio, lsr, sampling_rate)
# Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk.
# '10' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.
if torch.any(audio > 10) or not torch.any(audio < 0):
print(f"Error with {audiopath}. Max={audio.max()} min={audio.min()}")
# clip audio invalid values
audio.clip_(-1, 1)
return audio
def pad_or_truncate(t, length): def pad_or_truncate(t, length):
""" """
Ensure a given tensor t has a specified sequence length by either padding it with zeros or clipping it. Ensure a given tensor t has a specified sequence length by either padding it with zeros or clipping it.
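The single `load_audio` helper added in this hunk replaces the copy removed from `dataset.py` earlier in this diff; the dataset now imports it from `TTS.tts.models.xtts`. A minimal usage sketch, assuming a hypothetical reference file `speaker.wav`:

```python
from TTS.tts.models.xtts import load_audio

# Returns a mono float tensor clipped to [-1, 1], resampled to the requested rate.
audio = load_audio("speaker.wav", sampling_rate=22050)  # hypothetical path
print(audio.shape)  # torch.Size([1, num_samples])
```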
@ -89,78 +111,6 @@ def pad_or_truncate(t, length):
return tp return tp
def load_discrete_vocoder_diffuser(
trained_diffusion_steps=4000,
desired_diffusion_steps=200,
cond_free=True,
cond_free_k=1,
sampler="ddim",
):
"""
Load a GaussianDiffusion instance configured for use as a decoder.
Args:
trained_diffusion_steps (int): The number of diffusion steps used during training.
desired_diffusion_steps (int): The number of diffusion steps to use during inference.
cond_free (bool): Whether to use a conditioning-free model.
cond_free_k (int): The number of samples to use for conditioning-free models.
sampler (str): The name of the sampler to use.
Returns:
A SpacedDiffusion instance configured with the given parameters.
"""
return SpacedDiffusion(
use_timesteps=space_timesteps(trained_diffusion_steps, [desired_diffusion_steps]),
model_mean_type="epsilon",
model_var_type="learned_range",
loss_type="mse",
betas=get_named_beta_schedule("linear", trained_diffusion_steps),
conditioning_free=cond_free,
conditioning_free_k=cond_free_k,
sampler=sampler,
)
def do_spectrogram_diffusion(
diffusion_model,
diffuser,
latents,
conditioning_latents,
temperature=1,
):
"""
Generate a mel-spectrogram using a diffusion model and a diffuser.
Args:
diffusion_model (nn.Module): A diffusion model that converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
diffuser (Diffuser): A diffuser that generates a mel-spectrogram from noise.
latents (torch.Tensor): A tensor of shape (batch_size, seq_len, code_size) containing the input spectrogram codes.
conditioning_latents (torch.Tensor): A tensor of shape (batch_size, code_size) containing the conditioning codes.
temperature (float, optional): The temperature of the noise used by the diffuser. Defaults to 1.
Returns:
torch.Tensor: A tensor of shape (batch_size, mel_channels, mel_seq_len) containing the generated mel-spectrogram.
"""
with torch.no_grad():
output_seq_len = (
latents.shape[1] * 4 * 24000 // 22050
) # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
output_shape = (latents.shape[0], 100, output_seq_len)
precomputed_embeddings = diffusion_model.timestep_independent(
latents, conditioning_latents, output_seq_len, False
)
noise = torch.randn(output_shape, device=latents.device) * temperature
mel = diffuser.sample_loop(
diffusion_model,
output_shape,
noise=noise,
model_kwargs={"precomputed_aligned_embeddings": precomputed_embeddings},
progress=False,
)
return denormalize_tacotron_mel(mel)[:, :, :output_seq_len]
@dataclass @dataclass
class XttsAudioConfig(Coqpit): class XttsAudioConfig(Coqpit):
""" """
@ -168,12 +118,10 @@ class XttsAudioConfig(Coqpit):
Args: Args:
sample_rate (int): The sample rate in which the GPT operates. sample_rate (int): The sample rate in which the GPT operates.
diffusion_sample_rate (int): The sample rate of the diffusion audio waveform.
output_sample_rate (int): The sample rate of the output audio waveform. output_sample_rate (int): The sample rate of the output audio waveform.
""" """
sample_rate: int = 22050 sample_rate: int = 22050
diffusion_sample_rate: int = 24000
output_sample_rate: int = 24000 output_sample_rate: int = 24000
@ -189,8 +137,6 @@ class XttsArgs(Coqpit):
clvp_checkpoint (str, optional): The checkpoint for the ConditionalLatentVariablePerseq model. Defaults to None. clvp_checkpoint (str, optional): The checkpoint for the ConditionalLatentVariablePerseq model. Defaults to None.
decoder_checkpoint (str, optional): The checkpoint for the DiffTTS model. Defaults to None. decoder_checkpoint (str, optional): The checkpoint for the DiffTTS model. Defaults to None.
num_chars (int, optional): The maximum number of characters to generate. Defaults to 255. num_chars (int, optional): The maximum number of characters to generate. Defaults to 255.
use_hifigan (bool, optional): Whether to use hifigan with implicit enhancement or diffusion + univnet as a decoder. Defaults to True.
use_ne_hifigan (bool, optional): Whether to use regular hifigan or diffusion + univnet as a decoder. Defaults to False.
For GPT model: For GPT model:
gpt_max_audio_tokens (int, optional): The maximum mel tokens for the autoregressive model. Defaults to 604. gpt_max_audio_tokens (int, optional): The maximum mel tokens for the autoregressive model. Defaults to 604.
@ -228,8 +174,6 @@ class XttsArgs(Coqpit):
clvp_checkpoint: str = None clvp_checkpoint: str = None
decoder_checkpoint: str = None decoder_checkpoint: str = None
num_chars: int = 255 num_chars: int = 255
use_hifigan: bool = True
use_ne_hifigan: bool = False
# XTTS GPT Encoder params # XTTS GPT Encoder params
tokenizer_file: str = "" tokenizer_file: str = ""
@ -326,7 +270,6 @@ class Xtts(BaseTTS):
code_stride_len=self.args.gpt_code_stride_len, code_stride_len=self.args.gpt_code_stride_len,
) )
if self.args.use_hifigan:
self.hifigan_decoder = HifiDecoder( self.hifigan_decoder = HifiDecoder(
input_sample_rate=self.args.input_sample_rate, input_sample_rate=self.args.input_sample_rate,
output_sample_rate=self.args.output_sample_rate, output_sample_rate=self.args.output_sample_rate,
@ -337,33 +280,6 @@ class Xtts(BaseTTS):
cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer, cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer,
) )
if self.args.use_ne_hifigan:
self.ne_hifigan_decoder = HifiDecoder(
input_sample_rate=self.args.input_sample_rate,
output_sample_rate=self.args.output_sample_rate,
output_hop_length=self.args.output_hop_length,
ar_mel_length_compression=self.args.gpt_code_stride_len,
decoder_input_dim=self.args.decoder_input_dim,
d_vector_dim=self.args.d_vector_dim,
cond_d_vector_in_each_upsampling_layer=self.args.cond_d_vector_in_each_upsampling_layer,
)
if not (self.args.use_hifigan or self.args.use_ne_hifigan):
self.diffusion_decoder = DiffusionTts(
model_channels=self.args.diff_model_channels,
num_layers=self.args.diff_num_layers,
in_channels=self.args.diff_in_channels,
out_channels=self.args.diff_out_channels,
in_latent_channels=self.args.diff_in_latent_channels,
in_tokens=self.args.diff_in_tokens,
dropout=self.args.diff_dropout,
use_fp16=self.args.diff_use_fp16,
num_heads=self.args.diff_num_heads,
layer_drop=self.args.diff_layer_drop,
unconditioned_percentage=self.args.diff_unconditioned_percentage,
)
self.vocoder = UnivNetGenerator()
@property @property
def device(self): def device(self):
return next(self.parameters()).device return next(self.parameters()).device
@ -373,7 +289,7 @@ class Xtts(BaseTTS):
"""Compute the conditioning latents for the GPT model from the given audio. """Compute the conditioning latents for the GPT model from the given audio.
Args: Args:
audio_path (str): Path to the audio file. audio (tensor): audio tensor.
sr (int): Sample rate of the audio. sr (int): Sample rate of the audio.
length (int): Length of the audio in seconds. Defaults to 3. length (int): Length of the audio in seconds. Defaults to 3.
""" """
@ -441,12 +357,21 @@ class Xtts(BaseTTS):
max_ref_length=10, max_ref_length=10,
librosa_trim_db=None, librosa_trim_db=None,
sound_norm_refs=False, sound_norm_refs=False,
load_sr=24000,
): ):
speaker_embedding = None # deal with multiples references
diffusion_cond_latents = None if not isinstance(audio_path, list):
audio_paths = [audio_path]
else:
audio_paths = audio_path
audio, sr = torchaudio.load(audio_path) speaker_embeddings = []
audio = audio[:, : sr * max_ref_length].to(self.device) audios = []
speaker_embedding = None
for file_path in audio_paths:
# load the audio in 24khz to avoid issued with multiple sr references
audio = load_audio(file_path, load_sr)
audio = audio[:, : load_sr * max_ref_length].to(self.device)
if audio.shape[0] > 1: if audio.shape[0] > 1:
audio = audio.mean(0, keepdim=True) audio = audio.mean(0, keepdim=True)
if sound_norm_refs: if sound_norm_refs:
@ -454,12 +379,20 @@ class Xtts(BaseTTS):
if librosa_trim_db is not None: if librosa_trim_db is not None:
audio = librosa.effects.trim(audio, top_db=librosa_trim_db)[0] audio = librosa.effects.trim(audio, top_db=librosa_trim_db)[0]
if self.args.use_hifigan or self.args.use_ne_hifigan: speaker_embedding = self.get_speaker_embedding(audio, load_sr)
speaker_embedding = self.get_speaker_embedding(audio, sr) speaker_embeddings.append(speaker_embedding)
else:
diffusion_cond_latents = self.get_diffusion_cond_latents(audio, sr) audios.append(audio)
gpt_cond_latents = self.get_gpt_cond_latents(audio, sr, length=gpt_cond_len) # [1, 1024, T]
return gpt_cond_latents, diffusion_cond_latents, speaker_embedding # use a merge of all references for gpt cond latents
full_audio = torch.cat(audios, dim=-1)
gpt_cond_latents = self.get_gpt_cond_latents(full_audio, load_sr, length=gpt_cond_len) # [1, 1024, T]
if speaker_embeddings:
speaker_embedding = torch.stack(speaker_embeddings)
speaker_embedding = speaker_embedding.mean(dim=0)
return gpt_cond_latents, speaker_embedding
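With this rework, `get_conditioning_latents` accepts either a single path or a list of reference wavs, loads everything at 24 kHz, and returns two values instead of three (the diffusion latents are gone). A minimal sketch, assuming `model` is an already-loaded `Xtts` instance and the reference paths are hypothetical:

```python
# A single path or a list of paths both work after this change.
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
    audio_path=["speaker_1.wav", "speaker_2.wav"],
)
# gpt_cond_latent: [1, 1024, T], computed from the concatenated references
# speaker_embedding: mean of the per-reference speaker embeddings
```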
def synthesize(self, text, config, speaker_wav, language, **kwargs): def synthesize(self, text, config, speaker_wav, language, **kwargs):
"""Synthesize speech with the given input text. """Synthesize speech with the given input text.
@ -467,7 +400,7 @@ class Xtts(BaseTTS):
Args: Args:
text (str): Input text. text (str): Input text.
config (XttsConfig): Config with inference parameters. config (XttsConfig): Config with inference parameters.
speaker_wav (str): Path to the speaker audio file for cloning. speaker_wav (list): List of paths to the speaker audio files to be used for cloning.
language (str): Language ID of the speaker. language (str): Language ID of the speaker.
**kwargs: Inference settings. See `inference()`. **kwargs: Inference settings. See `inference()`.
@ -477,11 +410,6 @@ class Xtts(BaseTTS):
as latents used at inference. as latents used at inference.
""" """
# Make the synthesizer happy 🥳
if isinstance(speaker_wav, list):
speaker_wav = speaker_wav[0]
return self.inference_with_config(text, config, ref_audio_path=speaker_wav, language=language, **kwargs) return self.inference_with_config(text, config, ref_audio_path=speaker_wav, language=language, **kwargs)
def inference_with_config(self, text, config, ref_audio_path, language, **kwargs): def inference_with_config(self, text, config, ref_audio_path, language, **kwargs):
@ -563,27 +491,6 @@ class Xtts(BaseTTS):
gpt_cond_len: (int) Length of the audio used for cloning. If audio is shorter, then audio length is used gpt_cond_len: (int) Length of the audio used for cloning. If audio is shorter, then audio length is used
else the first `gpt_cond_len` secs is used. Defaults to 6 seconds. else the first `gpt_cond_len` secs is used. Defaults to 6 seconds.
decoder_iterations: (int) Number of diffusion steps to perform. [0,4000]. More steps means the network has
more chances to iteratively refine the output, which should theoretically mean a higher quality output.
Generally a value above 250 is not noticeably better, however. Defaults to 100.
cond_free: (bool) Whether or not to perform conditioning-free diffusion. Conditioning-free diffusion
performs two forward passes for each diffusion step: one with the outputs of the autoregressive model
and one with no conditioning priors. The output of the two is blended according to the cond_free_k
value below. Conditioning-free diffusion is the real deal, and dramatically improves realism.
Defaults to True.
cond_free_k: (float) Knob that determines how to balance the conditioning free signal with the
conditioning-present signal. [0,inf]. As cond_free_k increases, the output becomes dominated by the
conditioning-free signal. Defaults to 2.0.
diffusion_temperature: (float) Controls the variance of the noise fed into the diffusion model. [0,1].
Values at 0 re the "mean" prediction of the diffusion network and will sound bland and smeared.
Defaults to 1.0.
decoder: (str) Selects the decoder to use between ("hifigan", "ne_hifigan" and "diffusion")
Defaults to hifigan
hf_generate_kwargs: (**kwargs) The huggingface Transformers generate API is used for the autoregressive hf_generate_kwargs: (**kwargs) The huggingface Transformers generate API is used for the autoregressive
transformer. Extra keyword args fed to this function get forwarded directly to that API. Documentation transformer. Extra keyword args fed to this function get forwarded directly to that API. Documentation
here: https://huggingface.co/docs/transformers/internal/generation_utils here: https://huggingface.co/docs/transformers/internal/generation_utils
@ -592,7 +499,7 @@ class Xtts(BaseTTS):
Generated audio clip(s) as a torch tensor. Shape 1,S if k=1 else, (k,1,S) where S is the sample length. Generated audio clip(s) as a torch tensor. Shape 1,S if k=1 else, (k,1,S) where S is the sample length.
Sample rate is 24kHz. Sample rate is 24kHz.
""" """
(gpt_cond_latent, diffusion_conditioning, speaker_embedding) = self.get_conditioning_latents( (gpt_cond_latent, speaker_embedding) = self.get_conditioning_latents(
audio_path=ref_audio_path, audio_path=ref_audio_path,
gpt_cond_len=gpt_cond_len, gpt_cond_len=gpt_cond_len,
max_ref_length=max_ref_len, max_ref_length=max_ref_len,
@ -604,19 +511,12 @@ class Xtts(BaseTTS):
language, language,
gpt_cond_latent, gpt_cond_latent,
speaker_embedding, speaker_embedding,
diffusion_conditioning,
temperature=temperature, temperature=temperature,
length_penalty=length_penalty, length_penalty=length_penalty,
repetition_penalty=repetition_penalty, repetition_penalty=repetition_penalty,
top_k=top_k, top_k=top_k,
top_p=top_p, top_p=top_p,
do_sample=do_sample, do_sample=do_sample,
decoder_iterations=decoder_iterations,
cond_free=cond_free,
cond_free_k=cond_free_k,
diffusion_temperature=diffusion_temperature,
decoder_sampler=decoder_sampler,
decoder=decoder,
**hf_generate_kwargs, **hf_generate_kwargs,
) )
@ -627,7 +527,6 @@ class Xtts(BaseTTS):
language, language,
gpt_cond_latent, gpt_cond_latent,
speaker_embedding, speaker_embedding,
diffusion_conditioning,
# GPT inference # GPT inference
temperature=0.65, temperature=0.65,
length_penalty=1, length_penalty=1,
@ -635,13 +534,6 @@ class Xtts(BaseTTS):
top_k=50, top_k=50,
top_p=0.85, top_p=0.85,
do_sample=True, do_sample=True,
# Decoder inference
decoder_iterations=100,
cond_free=True,
cond_free_k=2,
diffusion_temperature=1.0,
decoder_sampler="ddim",
decoder="hifigan",
num_beams=1, num_beams=1,
**hf_generate_kwargs, **hf_generate_kwargs,
): ):
@ -656,14 +548,6 @@ class Xtts(BaseTTS):
text_tokens.shape[-1] < self.args.gpt_max_text_tokens text_tokens.shape[-1] < self.args.gpt_max_text_tokens
), " ❗ XTTS can only generate text with a maximum of 400 tokens." ), " ❗ XTTS can only generate text with a maximum of 400 tokens."
if not self.args.use_hifigan:
diffuser = load_discrete_vocoder_diffuser(
desired_diffusion_steps=decoder_iterations,
cond_free=cond_free,
cond_free_k=cond_free_k,
sampler=decoder_sampler,
)
with torch.no_grad(): with torch.no_grad():
gpt_codes = self.gpt.generate( gpt_codes = self.gpt.generate(
cond_latents=gpt_cond_latent, cond_latents=gpt_cond_latent,
@ -705,34 +589,12 @@ class Xtts(BaseTTS):
gpt_latents = gpt_latents[:, :k] gpt_latents = gpt_latents[:, :k]
break break
if decoder == "hifigan":
assert hasattr(
self, "hifigan_decoder"
), "You must enable hifigan decoder to use it by setting config `use_hifigan: true`"
wav = self.hifigan_decoder(gpt_latents, g=speaker_embedding) wav = self.hifigan_decoder(gpt_latents, g=speaker_embedding)
elif decoder == "ne_hifigan":
assert hasattr(
self, "ne_hifigan_decoder"
), "You must enable ne_hifigan decoder to use it by setting config `use_ne_hifigan: true`"
wav = self.ne_hifigan_decoder(gpt_latents, g=speaker_embedding)
else:
assert hasattr(
self, "diffusion_decoder"
), "You must disable hifigan decoders to use difffusion by setting config `use_ne_hifigan: false` and `use_hifigan: false`"
mel = do_spectrogram_diffusion(
self.diffusion_decoder,
diffuser,
gpt_latents,
diffusion_conditioning,
temperature=diffusion_temperature,
)
wav = self.vocoder.inference(mel)
return { return {
"wav": wav.cpu().numpy().squeeze(), "wav": wav.cpu().numpy().squeeze(),
"gpt_latents": gpt_latents, "gpt_latents": gpt_latents,
"speaker_embedding": speaker_embedding, "speaker_embedding": speaker_embedding,
"diffusion_conditioning": diffusion_conditioning,
} }
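The `inference` entry point loses all diffusion-related arguments, and its output dict no longer carries `diffusion_conditioning`. A minimal sketch of the slimmed-down call, reusing `model`, `gpt_cond_latent`, and `speaker_embedding` from the conditioning sketch above; the text and sampling values are illustrative:

```python
out = model.inference(
    "It took me quite a long time to develop a voice.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
    temperature=0.65,
    repetition_penalty=2.0,
    top_k=50,
    top_p=0.85,
)
wav = out["wav"]  # 24 kHz waveform as a numpy array; "gpt_latents" and "speaker_embedding" are also returned
```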
def handle_chunks(self, wav_gen, wav_gen_prev, wav_overlap, overlap_len): def handle_chunks(self, wav_gen, wav_gen_prev, wav_overlap, overlap_len):
@ -766,13 +628,8 @@ class Xtts(BaseTTS):
top_k=50, top_k=50,
top_p=0.85, top_p=0.85,
do_sample=True, do_sample=True,
# Decoder inference
decoder="hifigan",
**hf_generate_kwargs, **hf_generate_kwargs,
): ):
assert hasattr(
self, "hifigan_decoder"
), "`inference_stream` requires use_hifigan to be set to true in the config.model_args, diffusion is too slow to stream."
text = text.strip().lower() text = text.strip().lower()
text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device) text_tokens = torch.IntTensor(self.tokenizer.encode(text, lang=language)).unsqueeze(0).to(self.device)
@ -811,18 +668,7 @@ class Xtts(BaseTTS):
if is_end or (stream_chunk_size > 0 and len(last_tokens) >= stream_chunk_size): if is_end or (stream_chunk_size > 0 and len(last_tokens) >= stream_chunk_size):
gpt_latents = torch.cat(all_latents, dim=0)[None, :] gpt_latents = torch.cat(all_latents, dim=0)[None, :]
if decoder == "hifigan":
assert hasattr(
self, "hifigan_decoder"
), "You must enable hifigan decoder to use it by setting config `use_hifigan: true`"
wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device)) wav_gen = self.hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device))
elif decoder == "ne_hifigan":
assert hasattr(
self, "ne_hifigan_decoder"
), "You must enable ne_hifigan decoder to use it by setting config `use_ne_hifigan: true`"
wav_gen = self.ne_hifigan_decoder(gpt_latents, g=speaker_embedding.to(self.device))
else:
raise NotImplementedError("Diffusion for streaming inference not implemented.")
wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks( wav_chunk, wav_gen_prev, wav_overlap = self.handle_chunks(
wav_gen.squeeze(), wav_gen_prev, wav_overlap, overlap_wav_len wav_gen.squeeze(), wav_gen_prev, wav_overlap, overlap_wav_len
) )
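Streaming inference likewise drops the `decoder` switch: HiFi-GAN is the only decoder, and each chunk's latents go straight through `hifigan_decoder`. A minimal consumer sketch, with the same assumptions as above (an already-loaded `model` and previously computed conditioning latents):

```python
import torch

chunks = model.inference_stream(
    "It took me quite a long time to develop a voice.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
)

wav_chunks = []
for i, chunk in enumerate(chunks):
    # Each yielded chunk is a 1-D waveform tensor at 24 kHz.
    print(f"chunk {i}: {chunk.shape[-1]} samples")
    wav_chunks.append(chunk)
wav = torch.cat(wav_chunks, dim=0)
```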
@ -850,11 +696,8 @@ class Xtts(BaseTTS):
def get_compatible_checkpoint_state_dict(self, model_path): def get_compatible_checkpoint_state_dict(self, model_path):
checkpoint = load_fsspec(model_path, map_location=torch.device("cpu"))["model"] checkpoint = load_fsspec(model_path, map_location=torch.device("cpu"))["model"]
ignore_keys = ["diffusion_decoder", "vocoder"] if self.args.use_hifigan or self.args.use_ne_hifigan else []
ignore_keys += [] if self.args.use_hifigan else ["hifigan_decoder"]
ignore_keys += [] if self.args.use_ne_hifigan else ["ne_hifigan_decoder"]
# remove xtts gpt trainer extra keys # remove xtts gpt trainer extra keys
ignore_keys += ["torch_mel_spectrogram_style_encoder", "torch_mel_spectrogram_dvae", "dvae"] ignore_keys = ["torch_mel_spectrogram_style_encoder", "torch_mel_spectrogram_dvae", "dvae"]
for key in list(checkpoint.keys()): for key in list(checkpoint.keys()):
# check if it is from the coqui Trainer if so convert it # check if it is from the coqui Trainer if so convert it
if key.startswith("xtts."): if key.startswith("xtts."):
@ -913,14 +756,7 @@ class Xtts(BaseTTS):
self.load_state_dict(checkpoint, strict=strict) self.load_state_dict(checkpoint, strict=strict)
if eval: if eval:
if hasattr(self, "hifigan_decoder"):
self.hifigan_decoder.eval() self.hifigan_decoder.eval()
if hasattr(self, "ne_hifigan_decoder"):
self.hifigan_decoder.eval()
if hasattr(self, "diffusion_decoder"):
self.diffusion_decoder.eval()
if hasattr(self, "vocoder"):
self.vocoder.eval()
self.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache, use_deepspeed=use_deepspeed) self.gpt.init_gpt_for_inference(kv_cache=self.args.kv_cache, use_deepspeed=use_deepspeed)
self.gpt.eval() self.gpt.eval()
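The eval path in `load_checkpoint` is simplified to only the HiFi-GAN decoder. A minimal loading sketch assembled from the doc snippets elsewhere in this diff; the checkpoint directory and config path are hypothetical:

```python
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

config = XttsConfig()
config.load_json("/path/to/xtts/config.json")  # hypothetical paths
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", eval=True)
model.cuda()
```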


@ -39,6 +39,7 @@ You can also mail us at info@coqui.ai.
### Inference ### Inference
#### 🐸TTS API #### 🐸TTS API
##### Single reference
```python ```python
from TTS.api import TTS from TTS.api import TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True) tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
@ -46,12 +47,25 @@ tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
# generate speech by cloning a voice using default settings # generate speech by cloning a voice using default settings
tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.", tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
file_path="output.wav", file_path="output.wav",
speaker_wav="/path/to/target/speaker.wav", speaker_wav=["/path/to/target/speaker.wav"],
language="en")
```
##### Multiple references
```python
from TTS.api import TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)
# generate speech by cloning a voice using default settings
tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
file_path="output.wav",
speaker_wav=["/path/to/target/speaker.wav", "/path/to/target/speaker_2.wav", "/path/to/target/speaker_3.wav"],
language="en") language="en")
``` ```
#### 🐸TTS Command line #### 🐸TTS Command line
##### Single reference
```console ```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \ tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
--text "Bugün okula gitmek istemiyorum." \ --text "Bugün okula gitmek istemiyorum." \
@ -60,6 +74,25 @@ tts.tts_to_file(text="It took me quite a long time to develop a voice, and now t
--use_cuda true --use_cuda true
``` ```
##### Multiple references
```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
--text "Bugün okula gitmek istemiyorum." \
--speaker_wav /path/to/target/speaker.wav /path/to/target/speaker_2.wav /path/to/target/speaker_3.wav \
--language_idx tr \
--use_cuda true
```
or for all wav files in a directory you can use:
```console
tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
--text "Bugün okula gitmek istemiyorum." \
--speaker_wav /path/to/target/*.wav \
--language_idx tr \
--use_cuda true
```
#### model directly #### model directly
If you want to be able to run with `use_deepspeed=True` and enjoy the speedup, you need to install deepspeed first. If you want to be able to run with `use_deepspeed=True` and enjoy the speedup, you need to install deepspeed first.
@ -83,7 +116,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
model.cuda() model.cuda()
print("Computing speaker latents...") print("Computing speaker latents...")
gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path="reference.wav") gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
print("Inference...") print("Inference...")
out = model.inference( out = model.inference(
@ -120,7 +153,7 @@ model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=Tru
model.cuda() model.cuda()
print("Computing speaker latents...") print("Computing speaker latents...")
gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path="reference.wav") gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
print("Inference...") print("Inference...")
t0 = time.time() t0 = time.time()
@ -177,7 +210,7 @@ model.load_checkpoint(config, checkpoint_path=XTTS_CHECKPOINT, vocab_path=TOKENI
model.cuda() model.cuda()
print("Computing speaker latents...") print("Computing speaker latents...")
gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=SPEAKER_REFERENCE) gpt_cond_latent, diffusion_conditioning, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_REFERENCE])
print("Inference...") print("Inference...")
out = model.inference( out = model.inference(


@ -41,8 +41,8 @@ os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
# DVAE files # DVAE files
DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/dvae.pth" DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/dvae.pth"
MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/mel_stats.pth" MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/mel_stats.pth"
# Set the path to the downloaded files # Set the path to the downloaded files
DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, DVAE_CHECKPOINT_LINK.split("/")[-1]) DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, DVAE_CHECKPOINT_LINK.split("/")[-1])
@ -55,8 +55,8 @@ if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
# Download XTTS v1.1 checkpoint if needed # Download XTTS v1.1 checkpoint if needed
TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/vocab.json" TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/vocab.json"
XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/model.pth" XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.2/model.pth"
# XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning. # XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning.
TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, TOKENIZER_FILE_LINK.split("/")[-1]) # vocab.json file TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, TOKENIZER_FILE_LINK.split("/")[-1]) # vocab.json file
@ -71,9 +71,9 @@ if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
# Training sentences generations # Training sentences generations
SPEAKER_REFERENCE = ( SPEAKER_REFERENCE = [
"./tests/data/ljspeech/wavs/LJ001-0002.wav" # speaker reference to be used in training test sentences "./tests/data/ljspeech/wavs/LJ001-0002.wav" # speaker reference to be used in training test sentences
) ]
LANGUAGE = config_dataset.language LANGUAGE = config_dataset.language
@ -94,12 +94,9 @@ def main():
gpt_num_audio_tokens=8194, gpt_num_audio_tokens=8194,
gpt_start_audio_token=8192, gpt_start_audio_token=8192,
gpt_stop_audio_token=8193, gpt_stop_audio_token=8193,
use_ne_hifigan=True, # if it is true it will keep the non-enhanced keys on the output checkpoint
) )
# define audio config # define audio config
audio_config = XttsAudioConfig( audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000
)
# training parameters config # training parameters config
config = GPTTrainerConfig( config = GPTTrainerConfig(
output_path=OUT_PATH, output_path=OUT_PATH,


@ -41,27 +41,26 @@ os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
# DVAE files # DVAE files
DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/dvae.pth" DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v1.1.1/mel_stats.pth" MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
# Set the path to the downloaded files # Set the path to the downloaded files
DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, DVAE_CHECKPOINT_LINK.split("/")[-1]) DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, MEL_NORM_LINK.split("/")[-1]) MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
# download DVAE files if needed # download DVAE files if needed
if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE): if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
print(" > Downloading DVAE files!") print(" > Downloading DVAE files!")
ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True) ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
# ToDo: Update links for XTTS v2.0
# Download XTTS v2.0 checkpoint if needed # Download XTTS v2.0 checkpoint if needed
TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v2.0/vocab.json" TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v1/v2.0/model.pth" XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
# XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning. # XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning.
TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, TOKENIZER_FILE_LINK.split("/")[-1]) # vocab.json file TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK)) # vocab.json file
XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, XTTS_CHECKPOINT_LINK.split("/")[-1]) # model.pth file XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK)) # model.pth file
# download XTTS v2.0 files if needed # download XTTS v2.0 files if needed
if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT): if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
@ -72,9 +71,9 @@ if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
# Training sentences generations # Training sentences generations
SPEAKER_REFERENCE = ( SPEAKER_REFERENCE = [
"./tests/data/ljspeech/wavs/LJ001-0002.wav" # speaker reference to be used in training test sentences "./tests/data/ljspeech/wavs/LJ001-0002.wav" # speaker reference to be used in training test sentences
) ]
LANGUAGE = config_dataset.language LANGUAGE = config_dataset.language
@ -90,17 +89,14 @@ def main():
dvae_checkpoint=DVAE_CHECKPOINT, dvae_checkpoint=DVAE_CHECKPOINT,
xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune
tokenizer_file=TOKENIZER_FILE, tokenizer_file=TOKENIZER_FILE,
gpt_num_audio_tokens=8194, gpt_num_audio_tokens=1026,
gpt_start_audio_token=8192, gpt_start_audio_token=1024,
gpt_stop_audio_token=8193, gpt_stop_audio_token=1025,
use_ne_hifigan=True, # if it is true it will keep the non-enhanced keys on the output checkpoint
gpt_use_masking_gt_prompt_approach=True, gpt_use_masking_gt_prompt_approach=True,
gpt_use_perceiver_resampler=True, gpt_use_perceiver_resampler=True,
) )
# define audio config # define audio config
audio_config = XttsAudioConfig( audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000
)
# training parameters config # training parameters config
config = GPTTrainerConfig( config = GPTTrainerConfig(
output_path=OUT_PATH, output_path=OUT_PATH,
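The recipe now builds `XttsAudioConfig` without the removed `diffusion_sample_rate` field and switches to the XTTS-v2 GPT token ids. A condensed sketch of those two pieces; the import path is assumed from the recipe's own imports (not shown in this hunk), and the argument set is trimmed to what the diff shows:

```python
# Assumed import path for the trainer configs used by the recipe.
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, XttsAudioConfig

model_args = GPTArgs(
    gpt_num_audio_tokens=1026,  # XTTS-v2 values from the hunk above
    gpt_start_audio_token=1024,
    gpt_stop_audio_token=1025,
    gpt_use_masking_gt_prompt_approach=True,
    gpt_use_perceiver_resampler=True,
)
# diffusion_sample_rate is gone; only the GPT, DVAE, and output rates remain.
audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
```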


@ -60,7 +60,7 @@ XTTS_CHECKPOINT = None # "/raid/edresson/dev/Checkpoints/XTTS_evaluation/xtts_s
# Training sentences generations # Training sentences generations
SPEAKER_REFERENCE = "tests/data/ljspeech/wavs/LJ001-0002.wav" # speaker reference to be used in training test sentences SPEAKER_REFERENCE = ["tests/data/ljspeech/wavs/LJ001-0002.wav"] # speaker reference to be used in training test sentences
LANGUAGE = config_dataset.language LANGUAGE = config_dataset.language
@ -86,11 +86,8 @@ model_args = GPTArgs(
gpt_num_audio_tokens=8194, gpt_num_audio_tokens=8194,
gpt_start_audio_token=8192, gpt_start_audio_token=8192,
gpt_stop_audio_token=8193, gpt_stop_audio_token=8193,
use_ne_hifigan=True,
)
audio_config = XttsAudioConfig(
sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000
) )
audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
config = GPTTrainerConfig( config = GPTTrainerConfig(
epochs=1, epochs=1,
output_path=OUT_PATH, output_path=OUT_PATH,


@ -58,7 +58,7 @@ XTTS_CHECKPOINT = None # "/raid/edresson/dev/Checkpoints/XTTS_evaluation/xtts_s
# Training sentences generations # Training sentences generations
SPEAKER_REFERENCE = "tests/data/ljspeech/wavs/LJ001-0002.wav" # speaker reference to be used in training test sentences SPEAKER_REFERENCE = ["tests/data/ljspeech/wavs/LJ001-0002.wav"] # speaker reference to be used in training test sentences
LANGUAGE = config_dataset.language LANGUAGE = config_dataset.language
@ -86,11 +86,10 @@ model_args = GPTArgs(
gpt_stop_audio_token=8193, gpt_stop_audio_token=8193,
gpt_use_masking_gt_prompt_approach=True, gpt_use_masking_gt_prompt_approach=True,
gpt_use_perceiver_resampler=True, gpt_use_perceiver_resampler=True,
use_ne_hifigan=True,
)
audio_config = XttsAudioConfig(
sample_rate=22050, dvae_sample_rate=22050, diffusion_sample_rate=24000, output_sample_rate=24000
) )
audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
config = GPTTrainerConfig( config = GPTTrainerConfig(
epochs=1, epochs=1,
output_path=OUT_PATH, output_path=OUT_PATH,


@ -101,7 +101,9 @@ def test_xtts_streaming():
from TTS.tts.configs.xtts_config import XttsConfig from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts from TTS.tts.models.xtts import Xtts
speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")]
speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav")
speaker_wav.append(speaker_wav_2)
model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1") model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1")
config = XttsConfig() config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json")) config.load_json(os.path.join(model_path, "config.json"))
@ -131,20 +133,21 @@ def test_xtts_v2():
"""XTTS is too big to run on github actions. We need to test it locally""" """XTTS is too big to run on github actions. We need to test it locally"""
output_path = os.path.join(get_tests_output_path(), "output.wav") output_path = os.path.join(get_tests_output_path(), "output.wav")
speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")
speaker_wav_2 = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0002.wav")
use_gpu = torch.cuda.is_available() use_gpu = torch.cuda.is_available()
if use_gpu: if use_gpu:
run_cli( run_cli(
"yes | " "yes | "
f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 " f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 "
f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True ' f'--text "This is an example." --out_path "{output_path}" --progress_bar False --use_cuda True '
f'--speaker_wav "{speaker_wav}" --language_idx "en"' f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" "--language_idx "en"'
) )
else: else:
run_cli( run_cli(
"yes | " "yes | "
f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 " f"tts --model_name tts_models/multilingual/multi-dataset/xtts_v2 "
f'--text "This is an example." --out_path "{output_path}" --progress_bar False ' f'--text "This is an example." --out_path "{output_path}" --progress_bar False '
f'--speaker_wav "{speaker_wav}" --language_idx "en"' f'--speaker_wav "{speaker_wav}" "{speaker_wav_2}" --language_idx "en"'
) )
@ -153,7 +156,7 @@ def test_xtts_v2_streaming():
from TTS.tts.configs.xtts_config import XttsConfig from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts from TTS.tts.models.xtts import Xtts
speaker_wav = os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav") speaker_wav = [os.path.join(get_tests_data_path(), "ljspeech", "wavs", "LJ001-0001.wav")]
model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2") model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v2")
config = XttsConfig() config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json")) config.load_json(os.path.join(model_path, "config.json"))