mirror of https://github.com/coqui-ai/TTS.git
parent: c7184dcef9
commit: 914280a556
@@ -1 +1 @@
-0.10.2
+0.11.0
@@ -10,7 +10,6 @@ from TTS.tts.utils.speakers import SpeakerManager


 def compute_encoder_accuracy(dataset_items, encoder_manager):

     class_name_key = encoder_manager.encoder_config.class_name_key
     map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)

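The getattr(..., None) call above reads an optional config field without assuming it exists. A minimal sketch of the same idea; the config class and field values here are made up, not the repository's:

class EncoderConfig:
    class_name_key = "speaker_name"
    # note: no map_classid_to_classname attribute is defined

config = EncoderConfig()
# returns None instead of raising AttributeError when the field is absent
mapping = getattr(config, "map_classid_to_classname", None)
if mapping is not None:
    print(mapping["0"])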
@@ -164,7 +164,6 @@ def extract_spectrograms(
     model.eval()
     export_metadata = []
     for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):

         # format data
         (
             text_input,
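For context, the surrounding code iterates a DataLoader with a progress bar while the model is in eval mode. A self-contained sketch of that pattern, assuming a toy model and dataset rather than the repository's classes:

import torch
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

model = torch.nn.Linear(4, 2)
data_loader = DataLoader(TensorDataset(torch.randn(16, 4)), batch_size=4)

model.eval()
outputs = []
with torch.no_grad():  # gradients are not needed for feature extraction
    for _, (batch,) in tqdm(enumerate(data_loader), total=len(data_loader)):
        outputs.append(model(batch))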
@@ -35,7 +35,6 @@ def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs


 if __name__ == "__main__":

     parser = argparse.ArgumentParser(
         description="""Resample a folder recusively with librosa
                        Can be used in place or create a copy of the folder as an output.\n\n
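The script wraps librosa's resampling. A minimal sketch of resampling one file; the paths and target rate are placeholders:

import librosa
import soundfile as sf

in_path, out_path, target_sr = "in.wav", "out.wav", 22050

# load at the file's native rate, then resample to the target rate
wav, orig_sr = librosa.load(in_path, sr=None)
wav_resampled = librosa.resample(wav, orig_sr=orig_sr, target_sr=target_sr)
sf.write(out_path, wav_resampled, target_sr)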
@@ -14,7 +14,6 @@ from TTS.utils.io import save_fsspec


 class AugmentWAV(object):
     def __init__(self, ap, augmentation_config):

         self.ap = ap
         self.use_additive_noise = False

@@ -67,7 +66,6 @@ class AugmentWAV(object):
             self.global_noise_list.append("RIR_AUG")

     def additive_noise(self, noise_type, audio):

         clean_db = 10 * np.log10(np.mean(audio**2) + 1e-4)

         noise_list = random.sample(
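additive_noise mixes a randomly chosen noise clip into the clean signal at a sampled SNR; the 1e-4 inside the log guards against log(0) on silent audio. A hedged sketch of that mixing logic, with an arbitrary SNR range rather than the repository's config:

import numpy as np

def mix_at_snr(clean, noise, snr_db):
    # signal power in dB, with a small epsilon so silence does not give -inf
    clean_db = 10 * np.log10(np.mean(clean**2) + 1e-4)
    noise_db = 10 * np.log10(np.mean(noise**2) + 1e-4)
    # scale the noise so that clean_db minus the scaled noise power equals snr_db
    gain = np.sqrt(10 ** ((clean_db - noise_db - snr_db) / 10))
    return clean + gain * noise

clean = np.random.randn(16000).astype(np.float32)
noise = np.random.randn(16000).astype(np.float32)
noisy = mix_at_snr(clean, noise, snr_db=np.random.uniform(5, 25))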
@@ -411,7 +411,6 @@ class TTSDataset(Dataset):

         # Puts each data field into a tensor with outer dimension batch size
         if isinstance(batch[0], collections.abc.Mapping):

             token_ids_lengths = np.array([len(d["token_ids"]) for d in batch])

             # sort items with text input length for RNN efficiency
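The collate function sorts samples by token length so padded batches waste less compute in the RNN. A small sketch of that sorting step; the field name mirrors the one visible above, the rest is illustrative:

import numpy as np

batch = [
    {"token_ids": [1, 2, 3]},
    {"token_ids": [1]},
    {"token_ids": [1, 2, 3, 4, 5]},
]

token_ids_lengths = np.array([len(d["token_ids"]) for d in batch])
# indices sorted by length, longest first, then reorder the batch accordingly
ids_sorted_decreasing = np.argsort(token_ids_lengths)[::-1]
batch = [batch[i] for i in ids_sorted_decreasing]
print([len(d["token_ids"]) for d in batch])  # [5, 3, 1]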
@@ -81,7 +81,6 @@ class RelativePositionTransformerDecoder(nn.Module):
     """

     def __init__(self, in_channels, out_channels, hidden_channels, params):

         super().__init__()
         self.prenet = Conv1dBN(in_channels, hidden_channels, 1, 1)
         self.rel_pos_transformer = RelativePositionTransformer(in_channels, out_channels, hidden_channels, **params)

@@ -111,7 +110,6 @@ class FFTransformerDecoder(nn.Module):
     """

     def __init__(self, in_channels, out_channels, params):

         super().__init__()
         self.transformer_block = FFTransformerBlock(in_channels, **params)
         self.postnet = nn.Conv1d(in_channels, out_channels, 1)

@@ -18,7 +18,6 @@ class DurationPredictor(nn.Module):
     """

     def __init__(self, hidden_channels):

         super().__init__()

         self.layers = nn.ModuleList(

@@ -100,7 +100,6 @@ class ResidualConv1dBNBlock(nn.Module):
     def __init__(
         self, in_channels, out_channels, hidden_channels, kernel_size, dilations, num_res_blocks=13, num_conv_blocks=2
     ):

         super().__init__()
         assert len(dilations) == num_res_blocks
         self.res_blocks = nn.ModuleList()

@@ -153,7 +153,6 @@ class WNBlocks(nn.Module):
         dropout_p=0,
         weight_norm=True,
     ):

         super().__init__()
         self.wn_blocks = nn.ModuleList()
         for idx in range(num_blocks):

@@ -64,7 +64,6 @@ class RelativePositionMultiHeadAttention(nn.Module):
         proximal_bias=False,
         proximal_init=False,
     ):

         super().__init__()
         assert channels % num_heads == 0, " [!] channels should be divisible by num_heads."
         # class attributes

@@ -272,7 +271,6 @@ class FeedForwardNetwork(nn.Module):
     """

     def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dropout_p=0.0, causal=False):

         super().__init__()
         self.in_channels = in_channels
         self.out_channels = out_channels

@@ -363,7 +363,6 @@ class TacotronLoss(torch.nn.Module):
         alignments_backwards,
         input_lens,
     ):

         # decoder outputs linear or mel spectrograms for Tacotron and Tacotron2
         # the target should be set acccordingly
         postnet_target = linear_input if self.config.model.lower() in ["tacotron"] else mel_input

@@ -22,7 +22,6 @@ class Encoder(nn.Module):
     """

     def __init__(self, num_chars, state_per_phone, in_out_channels=512, n_convolutions=3):

         super().__init__()

         self.state_per_phone = state_per_phone

@@ -36,7 +36,6 @@ class Decoder(nn.Module):
         sigmoid_scale=False,
         c_in_channels=0,
     ):

         super().__init__()

         self.glow_decoder = GlowDecoder(

@@ -123,7 +123,6 @@ class NeuralHMM(nn.Module):
         h_memory, c_memory = self._init_lstm_states(batch_size, self.memory_rnn_dim, mels)

         for t in range(T_max):

             # Process Autoregression
             h_memory, c_memory = self._process_ar_timestep(t, ar_inputs, h_memory, c_memory)
             # Get mean, std and transition vector from decoder for this timestep

@@ -418,7 +417,6 @@ class NeuralHMM(nn.Module):
         output_parameter_values = []
         quantile = 1
         while True:

             memory_input = self.prenet(prenet_input.flatten(1).unsqueeze(0))
             # will be 1 while sampling
             h_memory, c_memory = self.memory_rnn(memory_input.squeeze(0), (h_memory, c_memory))

@@ -50,7 +50,6 @@ class GravesAttention(nn.Module):
     COEF = 0.3989422917366028  # numpy.sqrt(1/(2*numpy.pi))

     def __init__(self, query_dim, K):

         super().__init__()
         self._mask_value = 1e-8
         self.K = K

@@ -83,7 +83,6 @@ class ReferenceEncoder(nn.Module):
     """

     def __init__(self, num_mel, out_dim):

         super().__init__()
         self.num_mel = num_mel
         filters = [1] + [32, 32, 64, 64, 128, 128]

@@ -31,7 +31,6 @@ class ReferenceEncoder(nn.Module):
     """

     def __init__(self, num_mel, embedding_dim):

         super().__init__()
         self.num_mel = num_mel
         filters = [1] + [32, 32, 64, 64, 128, 128]

@@ -119,7 +118,6 @@ class MultiHeadAttention(nn.Module):
     """

     def __init__(self, query_dim, key_dim, num_units, num_heads):

         super().__init__()
         self.num_units = num_units
         self.num_heads = num_heads

@@ -27,7 +27,6 @@ class BatchNormConv1d(nn.Module):
     """

     def __init__(self, in_channels, out_channels, kernel_size, stride, padding, activation=None):

         super().__init__()
         self.padding = padding
         self.padder = nn.ConstantPad1d(padding, 0)
@@ -149,7 +148,7 @@ class CBHG(nn.Module):
         activations += [None]
         # setup conv1d projection layers
         layer_set = []
-        for (in_size, out_size, ac) in zip(out_features, conv_projections, activations):
+        for in_size, out_size, ac in zip(out_features, conv_projections, activations):
             layer = BatchNormConv1d(in_size, out_size, kernel_size=3, stride=1, padding=[1, 1], activation=ac)
             layer_set.append(layer)
         self.conv1d_projections = nn.ModuleList(layer_set)
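Several hunks in this commit drop the redundant parentheses around tuple unpacking in for loops; both spellings are equivalent in Python, the parenthesized form is just noisier. A quick illustration:

pairs = [("a", 1), ("b", 2)]

# parenthesized unpacking (old style in this diff)
for (name, value) in pairs:
    print(name, value)

# identical behaviour without the parentheses (new style)
for name, value in pairs:
    print(name, value)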
@@ -21,7 +21,6 @@ def piecewise_rational_quadratic_transform(
     min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
     min_derivative=DEFAULT_MIN_DERIVATIVE,
 ):

     if tails is None:
         spline_fn = rational_quadratic_spline
         spline_kwargs = {}

@@ -109,7 +109,6 @@ class AlignTTS(BaseTTS):
         tokenizer: "TTSTokenizer" = None,
         speaker_manager: SpeakerManager = None,
     ):

         super().__init__(config, ap, tokenizer, speaker_manager)
         self.speaker_manager = speaker_manager
         self.phase = -1
@@ -252,7 +252,12 @@ class BaseTacotron(BaseTTS):

     def compute_capacitron_VAE_embedding(self, inputs, reference_mel_info, text_info=None, speaker_embedding=None):
         """Capacitron Variational Autoencoder"""
-        (VAE_outputs, posterior_distribution, prior_distribution, capacitron_beta,) = self.capacitron_vae_layer(
+        (
+            VAE_outputs,
+            posterior_distribution,
+            prior_distribution,
+            capacitron_beta,
+        ) = self.capacitron_vae_layer(
             reference_mel_info,
             text_info,
             speaker_embedding,  # pylint: disable=not-callable
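This hunk only reflows a long tuple assignment: when the target list no longer fits on one line, the formatter explodes the parenthesized targets one per line. A tiny illustration of the same reflow with made-up names:

def make_outputs():
    return 1, 2, 3, 4

# single-line unpacking
(a, b, c, d) = make_outputs()

# the reflowed, one-target-per-line form produced by the formatter
(
    first_output,
    second_output,
    third_output,
    fourth_output,
) = make_outputs()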
@@ -357,7 +357,6 @@ class BaseTTS(BaseTrainerModel):
     def _get_test_aux_input(
         self,
     ) -> Dict:

         d_vector = None
         if self.config.use_d_vector_file:
             d_vector = [self.speaker_manager.embeddings[name]["embedding"] for name in self.speaker_manager.embeddings]

@@ -63,7 +63,6 @@ class GlowTTS(BaseTTS):
         tokenizer: "TTSTokenizer" = None,
         speaker_manager: SpeakerManager = None,
     ):

         super().__init__(config, ap, tokenizer, speaker_manager)

         # pass all config fields to `self`

@@ -36,7 +36,6 @@ class Tacotron(BaseTacotron):
         tokenizer: "TTSTokenizer" = None,
         speaker_manager: SpeakerManager = None,
     ):

         super().__init__(config, ap, tokenizer, speaker_manager)

         # pass all config fields to `self`

@@ -50,7 +50,6 @@ class Tacotron2(BaseTacotron):
         tokenizer: "TTSTokenizer" = None,
         speaker_manager: SpeakerManager = None,
     ):

         super().__init__(config, ap, tokenizer, speaker_manager)

         self.decoder_output_dim = config.out_channels

@@ -633,7 +633,6 @@ class Vits(BaseTTS):
         speaker_manager: SpeakerManager = None,
         language_manager: LanguageManager = None,
     ):

         super().__init__(config, ap, tokenizer, speaker_manager, language_manager)

         self.init_multispeaker(config)

@@ -1280,7 +1279,6 @@ class Vits(BaseTTS):

             # compute melspec segment
             with autocast(enabled=False):

                 if self.args.encoder_sample_rate:
                     spec_segment_size = self.spec_segment_size * int(self.interpolate_factor)
                 else:

@@ -32,7 +32,6 @@ class BasePhonemizer(abc.ABC):
     """

     def __init__(self, language, punctuations=Punctuation.default_puncs(), keep_puncs=False):

         # ensure the backend is installed on the system
         if not self.is_available():
             raise RuntimeError("{} not installed on your system".format(self.name()))  # pragma: nocover

@@ -158,7 +158,6 @@ class AudioProcessor(object):
         verbose=True,
         **_,
     ):

         # setup class attributed
         self.sample_rate = sample_rate
         self.resample = resample
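AudioProcessor's signature ends in **_, which silently accepts and discards any extra keyword arguments, so a whole audio config dict can be splatted into the constructor even if it carries unrelated keys. A small sketch of that pattern with a toy stand-in class and invented config keys:

class ToyAudioProcessor:
    def __init__(self, sample_rate=22050, resample=False, **_):
        # keys this class does not care about are swallowed by **_
        self.sample_rate = sample_rate
        self.resample = resample

audio_config = {"sample_rate": 16000, "resample": True, "mel_fmax": 8000.0}
ap = ToyAudioProcessor(**audio_config)  # mel_fmax is ignored instead of raising TypeError
print(ap.sample_rate)  # 16000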
@@ -43,7 +43,6 @@ def stream_url(
         total=url_size,
         disable=not progress_bar,
     ) as pbar:

         num_bytes = 0
         while True:
             chunk = upointer.read(block_size)
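stream_url reads the response in fixed-size chunks and feeds the byte counts to a tqdm bar. A hedged, self-contained sketch of the same loop using urllib; the URL and block size are placeholders:

import urllib.request
from tqdm import tqdm

url = "https://example.com/file.bin"  # placeholder
block_size = 32 * 1024

with urllib.request.urlopen(url) as upointer:
    total = int(upointer.headers.get("Content-Length", 0)) or None
    with tqdm(total=total, unit="B", unit_scale=True) as pbar:
        while True:
            chunk = upointer.read(block_size)
            if not chunk:
                break  # end of stream
            pbar.update(len(chunk))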
@@ -31,13 +31,11 @@ class RAdam(Optimizer):
         super().__setstate__(state)

     def step(self, closure=None):

         loss = None
         if closure is not None:
             loss = closure()

         for group in self.param_groups:

             for p in group["params"]:
                 if p.grad is None:
                     continue
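The step method follows the standard torch.optim layout: optionally re-evaluate the loss via a closure, then walk param_groups and skip parameters without gradients. A minimal custom-optimizer skeleton in that style, using a plain SGD update purely for illustration:

import torch
from torch.optim import Optimizer

class PlainSGD(Optimizer):
    def __init__(self, params, lr=0.01):
        super().__init__(params, defaults={"lr": lr})

    @torch.no_grad()
    def step(self, closure=None):
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()
        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue  # parameter not used in this backward pass
                p.add_(p.grad, alpha=-group["lr"])
        return loss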
@@ -72,7 +72,6 @@ class PerfectBatchSampler(Sampler):
         self._num_classes_in_batch = num_classes_in_batch

     def __iter__(self):

         batch = []
         if self._num_classes_in_batch != len(self._samplers):
             valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch)

@@ -212,7 +212,6 @@ class Synthesizer(object):
         speaker_embedding = None
         speaker_id = None
         if self.tts_speakers_file or hasattr(self.tts_model.speaker_manager, "name_to_id"):

             # handle Neon models with single speaker.
             if len(self.tts_model.speaker_manager.name_to_id) == 1:
                 speaker_id = list(self.tts_model.speaker_manager.name_to_id.values())[0]
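When the loaded model knows exactly one speaker, the synthesizer simply picks that speaker's id instead of requiring the caller to name it. The same fallback in isolation; the mapping contents are made up:

name_to_id = {"default_speaker": 0}  # hypothetical single-speaker model

speaker_id = None
requested_name = None  # caller did not specify a speaker

if len(name_to_id) == 1:
    # only one choice, so use it without asking the caller
    speaker_id = list(name_to_id.values())[0]
elif requested_name is not None:
    speaker_id = name_to_id[requested_name]

print(speaker_id)  # 0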
@@ -247,7 +246,6 @@ class Synthesizer(object):
         if self.tts_languages_file or (
             hasattr(self.tts_model, "language_manager") and self.tts_model.language_manager is not None
         ):

             if len(self.tts_model.language_manager.name_to_id) == 1:
                 language_id = list(self.tts_model.language_manager.name_to_id.values())[0]

@@ -47,7 +47,6 @@ def get_vad_model_and_utils(use_cuda=False):
 def remove_silence(
     model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False
 ):

     # get the VAD model and utils functions
     model, get_speech_timestamps, _, collect_chunks = model_and_utils

@@ -118,7 +118,6 @@ class GANDataset(Dataset):
             mel = self.ap.melspectrogram(audio)
             audio, mel = self._pad_short_samples(audio, mel)
         else:

             # load precomputed features
             wavpath, feat_path = self.item_list[idx]

@@ -30,7 +30,6 @@ class WaveGradDataset(Dataset):
         use_cache=False,
         verbose=False,
     ):

         super().__init__()
         self.ap = ap
         self.item_list = items

@@ -12,7 +12,6 @@ class WaveRNNDataset(Dataset):
     def __init__(
         self, ap, items, seq_len, hop_len, pad, mode, mulaw, is_training=True, verbose=False, return_segments=True
     ):

         super().__init__()
         self.ap = ap
         self.compute_feat = not isinstance(items[0], (tuple, list))

@@ -52,7 +51,6 @@ class WaveRNNDataset(Dataset):
         else compute it on the fly
         """
         if self.compute_feat:

             wavpath = self.item_list[index]
             audio = self.ap.load_wav(wavpath)
             if self.return_segments:

@@ -74,7 +72,6 @@ class WaveRNNDataset(Dataset):
                 raise RuntimeError("Unknown dataset mode - ", self.mode)

         else:

             wavpath, feat_path = self.item_list[index]
             mel = np.load(feat_path.replace("/quant/", "/mel/"))
@@ -33,7 +33,6 @@ class ParallelWaveganGenerator(torch.nn.Module):
         upsample_factors=[4, 4, 4, 4],
         inference_padding=2,
     ):

         super().__init__()
         self.in_channels = in_channels
         self.out_channels = out_channels

@@ -77,7 +77,7 @@ class ConditionalDiscriminator(nn.Module):
         # layers before condition features
         self.pre_cond_layers += [DBlock(in_channels, 64, 1)]
         in_channels = 64
-        for (i, channel) in enumerate(out_channels):
+        for i, channel in enumerate(out_channels):
             self.pre_cond_layers.append(DBlock(in_channels, channel, downsample_factors[i]))
             in_channels = channel

@@ -116,7 +116,7 @@ class UnconditionalDiscriminator(nn.Module):
         self.layers = nn.ModuleList()
         self.layers += [DBlock(self.in_channels, base_channels, 1)]
         in_channels = base_channels
-        for (i, factor) in enumerate(downsample_factors):
+        for i, factor in enumerate(downsample_factors):
             self.layers.append(DBlock(in_channels, out_channels[i], factor))
             in_channels *= 2
         self.layers += [

@@ -147,7 +147,6 @@ class RandomWindowDiscriminator(nn.Module):
         cond_disc_out_channels=((128, 128, 256, 256), (128, 256, 256), (128, 256), (256,), (128, 256)),
         window_sizes=(512, 1024, 2048, 4096, 8192),
     ):

         super().__init__()
         self.cond_channels = cond_channels
         self.window_sizes = window_sizes

@@ -185,14 +184,14 @@ class RandomWindowDiscriminator(nn.Module):
         scores = []
         feats = []
         # unconditional pass
-        for (window_size, layer) in zip(self.window_sizes, self.unconditional_discriminators):
+        for window_size, layer in zip(self.window_sizes, self.unconditional_discriminators):
             index = np.random.randint(x.shape[-1] - window_size)

             score = layer(x[:, :, index : index + window_size])
             scores.append(score)

         # conditional pass
-        for (window_size, layer) in zip(self.window_sizes, self.conditional_discriminators):
+        for window_size, layer in zip(self.window_sizes, self.conditional_discriminators):
             frame_size = window_size // self.hop_length
             lc_index = np.random.randint(c.shape[-1] - frame_size)
             sample_index = lc_index * self.hop_length
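The random-window discriminator crops a random waveform window and, for the conditional pass, a mel window whose start is aligned to the waveform through hop_length. A hedged sketch of that index arithmetic with arbitrary shapes and hop length:

import numpy as np

hop_length = 256
x = np.random.randn(1, 1, 8192)                 # (B, 1, samples) fake waveform
c = np.random.randn(1, 80, 8192 // hop_length)  # (B, n_mels, frames) fake conditioning

window_size = 2048
frame_size = window_size // hop_length

# pick a mel frame index, then convert it to the matching sample index
lc_index = np.random.randint(c.shape[-1] - frame_size)
sample_index = lc_index * hop_length

mel_window = c[:, :, lc_index : lc_index + frame_size]
wav_window = x[:, :, sample_index : sample_index + window_size]
print(mel_window.shape, wav_window.shape)  # (1, 80, 8) (1, 1, 2048)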
@@ -32,7 +32,6 @@ class SpecDiscriminator(nn.Module):
         self.out = norm_f(nn.Conv2d(32, 1, 3, 1, 1))

     def forward(self, y):

         fmap = []
         with torch.no_grad():
             y = y.squeeze(1)

@@ -53,7 +52,6 @@ class MultiResSpecDiscriminator(torch.nn.Module):
     def __init__(  # pylint: disable=dangerous-default-value
         self, fft_sizes=[1024, 2048, 512], hop_sizes=[120, 240, 50], win_lengths=[600, 1200, 240], window="hann_window"
     ):

         super().__init__()
         self.discriminators = nn.ModuleList(
             [

@@ -312,7 +312,6 @@ class Wavernn(BaseVocoder):
         return self.fc3(x)

     def inference(self, mels, batched=None, target=None, overlap=None):

         self.eval()
         output = []
         start = time.time()

@@ -346,7 +345,6 @@ class Wavernn(BaseVocoder):
             aux_split = [aux[:, :, d * i : d * (i + 1)] for i in range(4)]

         for i in range(seq_len):

             m_t = mels[:, i, :]

             if self.args.use_aux_net:
@@ -9,6 +9,7 @@ from TTS.tts.models import setup_model

 torch.manual_seed(1)


 # pylint: disable=protected-access
 class TestExtractTTSSpectrograms(unittest.TestCase):
     @staticmethod
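Seeding the global RNG at module import keeps the randomly initialized models and sampled tensors in these tests deterministic across runs. A tiny illustration of what the seed buys:

import torch

torch.manual_seed(1)
a = torch.randn(3)

torch.manual_seed(1)
b = torch.randn(3)

print(torch.equal(a, b))  # True: same seed, same "random" numbers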
@@ -29,6 +29,7 @@ dataset_config_pt = BaseDatasetConfig(
 )
 """


 # pylint: disable=protected-access
 class TestFindUniquePhonemes(unittest.TestCase):
     @staticmethod

@@ -129,7 +129,6 @@ class AngleProtoLossTests(unittest.TestCase):
 class SoftmaxAngleProtoLossTests(unittest.TestCase):
     # pylint: disable=R0201
     def test_in_out(self):

         embedding_dim = 64
         num_speakers = 5
         batch_size = 4
@@ -45,7 +45,6 @@ class TestTTSDataset(unittest.TestCase):
         self.ap = AudioProcessor(**c.audio)

     def _create_dataloader(self, batch_size, r, bgs, start_by_longest=False):

         # load dataset
         meta_data_train, meta_data_eval = load_tts_samples(dataset_config, eval_split=True, eval_split_size=0.2)
         items = meta_data_train + meta_data_eval

@@ -75,7 +75,6 @@ class TestSamplers(unittest.TestCase):
         assert is_balanced(en, pt), "Language Weighted sampler is supposed to be balanced"

     def test_speaker_weighted_random_sampler(self):  # pylint: disable=no-self-use

         weighted_sampler = torch.utils.data.sampler.WeightedRandomSampler(
             get_speaker_balancer_weights(train_samples), len(train_samples)
         )
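The sampler test builds per-sample weights so under-represented speakers are drawn more often. A self-contained sketch of the same balancing idea with torch's WeightedRandomSampler; the toy labels stand in for get_speaker_balancer_weights:

from collections import Counter
import torch

labels = ["spk_a"] * 8 + ["spk_b"] * 2  # imbalanced toy dataset
counts = Counter(labels)

# each sample's weight is the inverse of its class frequency
weights = torch.tensor([1.0 / counts[l] for l in labels], dtype=torch.double)
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, num_samples=len(labels))

drawn = [labels[i] for i in sampler]
print(Counter(drawn))  # roughly balanced between spk_a and spk_b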
@@ -2,7 +2,6 @@ import os
 import unittest

 from tests import get_tests_data_path, get_tests_output_path

 from TTS.api import TTS

 OUTPUT_PATH = os.path.join(get_tests_output_path(), "test_python_api.wav")

@@ -235,7 +235,6 @@ class TestMultiPhonemizer(unittest.TestCase):
         self.phonemizer = MultiPhonemizer({"tr": "espeak", "en-us": "", "de": "gruut", "zh-cn": ""})

     def test_phonemize(self):

         # Enlish espeak
         text = "Be a voice, not an! echo?"
         gt = "biː ɐ vˈɔɪs, nˈɑːt æn! ˈɛkoʊ?"

@@ -332,7 +332,6 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):

     @staticmethod
     def test_train_step():

         config = config_global.copy()
         config.use_d_vector_file = True
@@ -401,7 +401,6 @@ class TestVits(unittest.TestCase):
     def test_train_step(self):
         # setup the model
         with torch.autograd.set_detect_anomaly(True):

             config = VitsConfig(model_args=VitsArgs(num_chars=32, spec_segment_size=10))
             model = Vits(config).to(device)
             model.train()
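Wrapping a training step in torch.autograd.set_detect_anomaly(True) makes the backward pass raise at the operation that produced a NaN or Inf instead of failing silently later; it is slow, so it is usually reserved for tests and debugging. A minimal illustration:

import torch

x = torch.randn(4, requires_grad=True)

with torch.autograd.set_detect_anomaly(True):
    y = (x * 2).sum()
    y.backward()  # a NaN produced in backward would raise with a traceback to the bad op

print(x.grad)  # tensor([2., 2., 2., 2.])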