mirror of https://github.com/coqui-ai/TTS.git

commit 9a48ba3821 (parent 4422642ec0): a ton of linter updates
@@ -170,7 +170,7 @@ def main():
         args.vocoder_name = model_item['default_vocoder'] if args.vocoder_name is None else args.vocoder_name

     if args.vocoder_name is not None:
-        vocoder_path, vocoder_config_path, vocoder_item = manager.download_model(args.vocoder_name)
+        vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

     # CASE3: load custome models
     if args.model_path is not None:
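Note: the `vocoder_item` → `_` rename is the usual fix for pylint's unused-variable (W0612) warning on tuple unpacking. A minimal, self-contained sketch of the pattern (the stub below is illustrative, not the real ModelManager API):

    def download_model_stub(name):
        # hypothetical stand-in for manager.download_model()
        return f"/models/{name}.pth", f"/models/{name}.json", {"default_vocoder": None}

    # `_` marks the third return value as intentionally unused
    vocoder_path, vocoder_config_path, _ = download_model_stub("my_vocoder")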
@@ -573,7 +573,7 @@ def main(args):  # pylint: disable=redefined-outer-name
         if c.run_eval:
             target_loss = eval_avg_loss_dict['avg_loss']
         best_loss = save_best_model(target_loss, best_loss, model, optimizer,
-                                    global_step, epoch, c.r, OUT_PATH,
+                                    global_step, epoch, c.r, OUT_PATH, model_characters,
                                     keep_all_best=keep_all_best, keep_after=keep_after)

@@ -1,8 +1,6 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

-import argparse
-import glob
 import os
 import sys
 import time
@@ -535,7 +533,7 @@ def main(args):  # pylint: disable=redefined-outer-name
         if c.run_eval:
             target_loss = eval_avg_loss_dict['avg_loss']
         best_loss = save_best_model(target_loss, best_loss, model, optimizer,
-                                    global_step, epoch, c.r, OUT_PATH,
+                                    global_step, epoch, c.r, OUT_PATH, model_characters,
                                     keep_all_best=keep_all_best, keep_after=keep_after)

@@ -648,12 +648,14 @@ def main(args):  # pylint: disable=redefined-outer-name
                 epoch,
                 c.r,
                 OUT_PATH,
+                model_characters,
                 keep_all_best=keep_all_best,
                 keep_after=keep_after,
                 scaler=scaler.state_dict() if c.mixed_precision else None
             )

+

 if __name__ == '__main__':
     args = parse_arguments(sys.argv)
     c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
@@ -50,7 +50,7 @@ def setup_loader(ap, is_val=False, verbose=False):
     sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None
     loader = DataLoader(dataset,
                         batch_size=1 if is_val else c.batch_size,
-                        shuffle=False if num_gpus > 1 else True,
+                        shuffle=num_gpus == 0,
                         drop_last=False,
                         sampler=sampler,
                         num_workers=c.num_val_loader_workers
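Note: this rewrite is not behavior-preserving. The old expression shuffles whenever `num_gpus <= 1`; the new one only when `num_gpus == 0`, so single-GPU runs (where `sampler` is also `None`) stop shuffling. A quick check of the disagreement:

    # the two expressions disagree exactly at num_gpus == 1
    for num_gpus in (0, 1, 2):
        old = False if num_gpus > 1 else True
        new = num_gpus == 0
        print(num_gpus, old, new)  # 0: True True | 1: True False | 2: False False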
@@ -59,7 +59,7 @@ if args.list_models:
 # set models by the released models
 if args.model_name is not None:
     tts_checkpoint_file, tts_config_file, tts_json_dict = manager.download_model(args.model_name)
     args.vocoder_name = tts_json_dict['default_vocoder'] if args.vocoder_name is None else args.vocoder_name

 if args.vocoder_name is not None:
     vocoder_checkpoint_file, vocoder_config_file, vocoder_json_dict = manager.download_model(args.vocoder_name)
@@ -1,7 +1,7 @@
 import collections
 import os
 import random
-from multiprocessing import Manager, Pool
+from multiprocessing import Pool

 import numpy as np
 import torch
@@ -3,7 +3,7 @@ from glob import glob
 import re
 import sys
 from pathlib import Path
-from typing import List, Tuple
+from typing import List

 from tqdm import tqdm

@@ -367,18 +367,18 @@ class MonotonicDynamicConvolutionAttention(nn.Module):
         beta (float, optional): [description]. Defaults to 0.9 from the paper.
     """
     def __init__(
             self,
             query_dim,
             embedding_dim,  # pylint: disable=unused-argument
             attention_dim,
             static_filter_dim,
             static_kernel_size,
             dynamic_filter_dim,
             dynamic_kernel_size,
             prior_filter_len=11,
             alpha=0.1,
             beta=0.9,
     ):
         super().__init__()
         self._mask_value = 1e-8
         self.dynamic_filter_dim = dynamic_filter_dim
@@ -402,7 +402,7 @@ class MonotonicDynamicConvolutionAttention(nn.Module):
         self.v = nn.Linear(attention_dim, 1, bias=False)

         prior = betabinom.pmf(range(prior_filter_len), prior_filter_len - 1,
                               alpha, beta)
         self.register_buffer("prior", torch.FloatTensor(prior).flip(0))

     # pylint: disable=unused-argument
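Note: for context, the `prior` buffer registered above is a beta-binomial window that biases the dynamic convolution attention toward moving forward in time. A standalone sketch with the defaults from this hunk (`prior_filter_len=11`, `alpha=0.1`, `beta=0.9`):

    import torch
    from scipy.stats import betabinom

    prior_filter_len, alpha, beta = 11, 0.1, 0.9
    prior = betabinom.pmf(range(prior_filter_len), prior_filter_len - 1, alpha, beta)
    prior = torch.FloatTensor(prior).flip(0)  # flipped so the heaviest tap comes last
    print(prior.sum())  # a pmf over 0..n, sums to ~1.0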
@@ -97,7 +97,7 @@ class ResidualConv1dBNBlock(nn.Module):
         assert len(dilations) == num_res_blocks
         self.res_blocks = nn.ModuleList()
         for idx, dilation in enumerate(dilations):
-            block = Conv1dBNBlock(in_channels if idx==0 else hidden_channels,
+            block = Conv1dBNBlock(in_channels if idx == 0 else hidden_channels,
                                   out_channels if (idx + 1) == len(dilations) else hidden_channels,
                                   hidden_channels,
                                   kernel_size,
@@ -98,11 +98,11 @@ class Encoder(nn.Module):
         if encoder_type.lower() == "rel_pos_transformer":
             if use_prenet:
                 self.prenet = ResidualConv1dLayerNormBlock(hidden_channels,
                                                            hidden_channels,
                                                            hidden_channels,
                                                            kernel_size=5,
                                                            num_layers=3,
                                                            dropout_p=0.5)
             self.encoder = RelativePositionTransformer(hidden_channels,
                                                        hidden_channels,
                                                        hidden_channels,
@@ -125,11 +125,11 @@ class Encoder(nn.Module):
         elif encoder_type.lower() == 'time_depth_separable':
             if use_prenet:
                 self.prenet = ResidualConv1dLayerNormBlock(hidden_channels,
                                                            hidden_channels,
                                                            hidden_channels,
                                                            kernel_size=5,
                                                            num_layers=3,
                                                            dropout_p=0.5)
             self.encoder = TimeDepthSeparableConvBlock(hidden_channels,
                                                        hidden_channels,
                                                        hidden_channels,
@@ -366,8 +366,10 @@ class RelativePositionTransformer(nn.Module):
             self.proj = nn.Conv1d(hidden_channels, out_channels, 1)

         self.ffn_layers.append(
-            FeedForwardNetwork(hidden_channels,
-                               hidden_channels if (idx + 1) != self.num_layers else out_channels,
+            FeedForwardNetwork(
+                hidden_channels,
+                hidden_channels if
+                (idx + 1) != self.num_layers else out_channels,
                                hidden_channels_ffn,
                                kernel_size,
                                dropout_p=dropout_p))
@@ -75,7 +75,7 @@ class ReferenceEncoder(nn.Module):
         # x: 3D tensor [batch_size, post_conv_width,
         #       num_channels*post_conv_height]
         self.recurrence.flatten_parameters()
-        memory, out = self.recurrence(x)
+        _, out = self.recurrence(x)
         # out: 3D tensor [seq_len==1, batch_size, encoding_size=128]

         return out.squeeze(0)
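Note: `nn.GRU` returns `(per-step outputs, final hidden state)`; the reference encoder only needs the final state, so binding the first value to `_` satisfies pylint. A self-contained sketch of the convention (sizes are illustrative):

    import torch
    from torch import nn

    gru = nn.GRU(input_size=16, hidden_size=8, batch_first=True)
    x = torch.rand(4, 10, 16)    # [batch, seq_len, features]
    _, h_n = gru(x)              # h_n: [num_layers, batch, hidden]
    print(h_n.squeeze(0).shape)  # torch.Size([4, 8])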
@@ -2,13 +2,12 @@ import math
 import numpy as np
 import torch
 from torch import nn
-from inspect import signature
 from torch.nn import functional
 from TTS.tts.utils.generic_utils import sequence_mask
 from TTS.tts.utils.ssim import ssim


-# pylint: disable=abstract-method Method
+# pylint: disable=abstract-method
 # relates https://github.com/pytorch/pytorch/issues/42305
 class L1LossMasked(nn.Module):
     def __init__(self, seq_len_norm):
@@ -165,7 +164,7 @@ class BCELossMasked(nn.Module):
         target.requires_grad = False
         if length is not None:
             mask = sequence_mask(sequence_length=length,
                                  max_len=target.size(1)).float()
             x = x * mask
             target = target * mask
             num_items = mask.sum()
@@ -310,10 +309,10 @@ class TacotronLoss(torch.nn.Module):
             if self.postnet_alpha > 0:
                 if self.config.model in ["Tacotron", "TacotronGST"]:
                     postnet_loss = self.criterion(postnet_output, linear_input,
                                                   output_lens)
                 else:
                     postnet_loss = self.criterion(postnet_output, mel_input,
                                                   output_lens)
         else:
             if self.decoder_alpha > 0:
                 decoder_loss = self.criterion(decoder_output, mel_input)
@@ -146,17 +146,17 @@ class Decoder(nn.Module):

     # pylint: disable=dangerous-default-value
     def __init__(
             self,
             out_channels,
             in_hidden_channels,
             decoder_type='residual_conv_bn',
             decoder_params={
                 "kernel_size": 4,
                 "dilations": 4 * [1, 2, 4, 8] + [1],
                 "num_conv_blocks": 2,
                 "num_res_blocks": 17
             },
             c_in_channels=0):
         super().__init__()

         if decoder_type == 'transformer':
@@ -73,13 +73,12 @@ class RelativePositionTransformerEncoder(nn.Module):
     def __init__(self, in_channels, out_channels, hidden_channels, params):
         super().__init__()
         self.prenet = ResidualConv1dBNBlock(in_channels,
                                             hidden_channels,
                                             hidden_channels,
                                             kernel_size=5,
                                             num_res_blocks=3,
                                             num_conv_blocks=1,
-                                            dilations=[1, 1, 1]
-                                            )
+                                            dilations=[1, 1, 1])
         self.rel_pos_transformer = RelativePositionTransformer(
             hidden_channels, out_channels, hidden_channels, **params)

@@ -104,9 +103,8 @@ class ResidualConv1dBNEncoder(nn.Module):
     """
     def __init__(self, in_channels, out_channels, hidden_channels, params):
         super().__init__()
-        self.prenet = nn.Sequential(
-            nn.Conv1d(in_channels, hidden_channels, 1),
-            nn.ReLU())
+        self.prenet = nn.Sequential(nn.Conv1d(in_channels, hidden_channels, 1),
+                                    nn.ReLU())
         self.res_conv_block = ResidualConv1dBNBlock(hidden_channels,
                                                     hidden_channels,
                                                     hidden_channels, **params)
@@ -162,17 +160,17 @@ class Encoder(nn.Module):
         }
     """
     def __init__(
             self,
             in_hidden_channels,
             out_channels,
             encoder_type='residual_conv_bn',
             encoder_params={
                 "kernel_size": 4,
                 "dilations": 4 * [1, 2, 4] + [1],
                 "num_conv_blocks": 2,
                 "num_res_blocks": 13
             },
             c_in_channels=0):
         super().__init__()
         self.out_channels = out_channels
         self.in_channels = in_hidden_channels
@@ -183,10 +181,9 @@ class Encoder(nn.Module):
         # init encoder
         if encoder_type.lower() == "transformer":
             # text encoder
-            self.encoder = RelativePositionTransformerEncoder(in_hidden_channels,
-                                                              out_channels,
-                                                              in_hidden_channels,
-                                                              encoder_params)  # pylint: disable=unexpected-keyword-arg
+            self.encoder = RelativePositionTransformerEncoder(
+                in_hidden_channels, out_channels, in_hidden_channels,
+                encoder_params)  # pylint: disable=unexpected-keyword-arg
         elif encoder_type.lower() == 'residual_conv_bn':
             self.encoder = ResidualConv1dBNEncoder(in_hidden_channels,
                                                    out_channels,
@@ -33,32 +33,32 @@ class SpeedySpeech(nn.Module):
         external_c (bool, optional): enable external speaker embeddings. Defaults to False.
         c_in_channels (int, optional): number of channels in speaker embedding vectors. Defaults to 0.
     """
     # pylint: disable=dangerous-default-value

     def __init__(
             self,
             num_chars,
             out_channels,
             hidden_channels,
             positional_encoding=True,
             length_scale=1,
             encoder_type='residual_conv_bn',
             encoder_params={
                 "kernel_size": 4,
                 "dilations": 4 * [1, 2, 4] + [1],
                 "num_conv_blocks": 2,
                 "num_res_blocks": 13
             },
             decoder_type='residual_conv_bn',
             decoder_params={
                 "kernel_size": 4,
                 "dilations": 4 * [1, 2, 4, 8] + [1],
                 "num_conv_blocks": 2,
                 "num_res_blocks": 17
             },
             num_speakers=0,
             external_c=False,
             c_in_channels=0):

         super().__init__()
         self.length_scale = float(length_scale) if isinstance(length_scale, int) else length_scale
@@ -171,7 +171,7 @@ class SpeedySpeech(nn.Module):
         """
         o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
         o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
-        o_de, attn= self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g)
+        o_de, attn = self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g)
         return o_de, o_dr_log.squeeze(1), attn

     def inference(self, x, x_lengths, g=None):  # pylint: disable=unused-argument
@@ -10,7 +10,7 @@ import re
 import itertools


-def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False) -> str:
+def _num2chinese(num: str, big=False, simp=True, o=False, twoalt=False) -> str:
     """Convert numerical arabic numbers (0->9) to chinese hanzi numbers (〇 -> 九)

     Args:
@@ -32,7 +32,7 @@ def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False) -> str:
     nd = str(num)
     if abs(float(nd)) >= 1e48:
         raise ValueError('number out of range')
-    elif 'e' in nd:
+    if 'e' in nd:
         raise ValueError('scientific notation is not supported')
     c_symbol = '正负点' if simp else '正負點'
     if o:  # formal
@@ -69,7 +69,7 @@ def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False) -> str:
         if int(unit) == 0:  # 0000
             intresult.append(c_basic[0])
             continue
-        elif nu > 0 and int(unit) == 2:  # 0002
+        if nu > 0 and int(unit) == 2:  # 0002
             intresult.append(c_twoalt + c_unit2[nu - 1])
             continue
         ulist = []
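Note: the two `elif` → `if` changes above follow pylint's no-else-raise / no-else-continue rules: when a branch ends in `raise` or `continue`, the `elif` that follows can be a plain `if`. A minimal illustration:

    def classify(value):
        if value < 0:
            raise ValueError("negative")
        if value == 0:  # was `elif`; the branch above never falls through
            return "zero"
        return "positive"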
@@ -135,7 +135,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
     return model

 def is_tacotron(c):
-    return False if c['model'] in ['speedy_speech', 'glow_tts'] else True
+    return not c['model'] in ['speedy_speech', 'glow_tts']

 def check_config_tts(c):
     check_argument('model', c, enum_list=['tacotron', 'tacotron2', 'glow_tts', 'speedy_speech'], restricted=True, val_type=str)
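Note: `not c['model'] in [...]` is equivalent to the old ternary, but a follow-up linter pass would likely push one step further to the `not in` operator (pylint's unneeded-not check flags `not x in y`); a sketch of that form:

    def is_tacotron(c):
        return c['model'] not in ['speedy_speech', 'glow_tts']

    print(is_tacotron({'model': 'tacotron2'}))  # True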
@@ -7,7 +7,7 @@ from TTS.utils.io import RenamingUnpickler



-def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False, eval=False):
+def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False, eval=False):  # pylint: disable=redefined-builtin
     """Load ```TTS.tts.models``` checkpoints.

     Args:
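Note: the `# pylint: disable=redefined-builtin` above silences W0622, raised because the `eval` parameter shadows Python's built-in `eval` inside the function. A self-contained sketch of the same situation (stub names are illustrative):

    # `eval` here is the flag, not the builtin, hence the disable
    def load_checkpoint_stub(checkpoint_path, eval=False):  # pylint: disable=redefined-builtin
        return {"path": checkpoint_path, "eval_mode": eval}

    print(load_checkpoint_stub("best_model.pth", eval=True))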
@@ -98,7 +98,7 @@ def save_checkpoint(model, optimizer, current_step, epoch, r, output_folder,


 def save_best_model(target_loss, best_loss, model, optimizer, current_step,
                     epoch, r, output_folder, characters, **kwargs):
     """Save model checkpoint, intended for saving the best model after each epoch.
     It compares the current model loss with the best loss so far and saves the
     model if the current loss is better.
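Note: the `model_characters` argument added at the call sites earlier in this diff feeds the `characters` parameter shown here, presumably so the character set used at training time is stored with the best checkpoint and inference can rebuild the same symbol table. A minimal, self-contained sketch of the pattern (all names are illustrative, not the real API):

    def save_best_model_stub(target_loss, best_loss, state, characters, **kwargs):
        if target_loss < best_loss:
            checkpoint = dict(state, characters=characters, **kwargs)  # would be torch.save()'d
            best_loss = target_loss
            print(f"new best (loss={best_loss:.3f}), {len(characters)} characters stored")
        return best_loss

    best = float("inf")
    best = save_best_model_stub(0.42, best, {"step": 100}, "abcdefghijklmnopqrstuvwxyz ")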
@@ -63,8 +63,8 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
             speaker_embedding_dim = None
             save_speaker_mapping(OUT_PATH, speaker_mapping)
             num_speakers = len(speaker_mapping)
-            print(" > Training with {} speakers: {}".format(len(speakers),
-                                                            ", ".join(speakers)))
+            print(" > Training with {} speakers: {}".format(
+                len(speakers), ", ".join(speakers)))
     else:
         num_speakers = 0
         speaker_embedding_dim = None
@@ -17,17 +17,22 @@ def create_window(window_size, channel):
     window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous())
     return window

-def _ssim(img1, img2, window, window_size, channel, size_average = True):
-    mu1 = F.conv2d(img1, window, padding = window_size//2, groups = channel)
-    mu2 = F.conv2d(img2, window, padding = window_size//2, groups = channel)
+
+def _ssim(img1, img2, window, window_size, channel, size_average=True):
+    mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
+    mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)

     mu1_sq = mu1.pow(2)
     mu2_sq = mu2.pow(2)
     mu1_mu2 = mu1*mu2

-    sigma1_sq = F.conv2d(img1*img1, window, padding = window_size//2, groups = channel) - mu1_sq
-    sigma2_sq = F.conv2d(img2*img2, window, padding = window_size//2, groups = channel) - mu2_sq
-    sigma12 = F.conv2d(img1*img2, window, padding = window_size//2, groups = channel) - mu1_mu2
+    sigma1_sq = F.conv2d(
+        img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
+    sigma2_sq = F.conv2d(
+        img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
+    sigma12 = F.conv2d(
+        img1 * img2, window, padding=window_size // 2,
+        groups=channel) - mu1_mu2

     C1 = 0.01**2
     C2 = 0.03**2
@@ -39,7 +44,7 @@ def _ssim(img1, img2, window, window_size, channel, size_average=True):
     return ssim_map.mean(1).mean(1).mean(1)

 class SSIM(torch.nn.Module):
-    def __init__(self, window_size = 11, size_average = True):
+    def __init__(self, window_size=11, size_average=True):
         super().__init__()
         self.window_size = window_size
         self.size_average = size_average
@@ -64,7 +69,8 @@ class SSIM(torch.nn.Module):

         return _ssim(img1, img2, window, self.window_size, channel, self.size_average)

-def ssim(img1, img2, window_size = 11, size_average = True):
+
+def ssim(img1, img2, window_size=11, size_average=True):
     (_, channel, _, _) = img1.size()
     window = create_window(window_size, channel)

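Note: the functional `ssim` whose signature is reformatted above is the one imported by the losses module earlier in this diff (`from TTS.tts.utils.ssim import ssim`). A usage sketch, assuming that import path; inputs are 4D `[batch, channel, H, W]` and identical images score ~1.0:

    import torch
    from TTS.tts.utils.ssim import ssim

    img1 = torch.rand(1, 1, 64, 64)
    img2 = img1.clone()
    print(float(ssim(img1, img2)))      # ~1.0 for identical images
    print(float(ssim(img1, 1 - img2)))  # noticeably lower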
@@ -20,9 +20,13 @@ def text_to_seqvec(text, CONFIG):
                 add_blank=CONFIG['add_blank'] if 'add_blank' in CONFIG.keys() else False),
             dtype=np.int32)
     else:
-        seq = np.asarray(
-            text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None,
-                             add_blank=CONFIG['add_blank'] if 'add_blank' in CONFIG.keys() else False), dtype=np.int32)
+        seq = np.asarray(text_to_sequence(
+            text,
+            text_cleaner,
+            tp=CONFIG.characters if 'characters' in CONFIG.keys() else None,
+            add_blank=CONFIG['add_blank']
+            if 'add_blank' in CONFIG.keys() else False),
+                         dtype=np.int32)
     return seq

@@ -77,9 +81,9 @@ def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel
     inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device)  # pylint: disable=not-callable
     if hasattr(model, 'module'):
         # distributed model
-        postnet_output, alignments= model.module.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
+        postnet_output, alignments = model.module.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
     else:
-        postnet_output, alignments= model.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
+        postnet_output, alignments = model.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
     postnet_output = postnet_output.permute(0, 2, 1)
     # these only belong to tacotron models.
     decoder_output = None
@@ -2,60 +2,60 @@ import re

 # List of (regular expression, replacement) pairs for abbreviations in english:
 abbreviations_en = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
                     for x in [
                         ('mrs', 'misess'),
                         ('mr', 'mister'),
                         ('dr', 'doctor'),
                         ('st', 'saint'),
                         ('co', 'company'),
                         ('jr', 'junior'),
                         ('maj', 'major'),
                         ('gen', 'general'),
                         ('drs', 'doctors'),
                         ('rev', 'reverend'),
                         ('lt', 'lieutenant'),
                         ('hon', 'honorable'),
                         ('sgt', 'sergeant'),
                         ('capt', 'captain'),
                         ('esq', 'esquire'),
                         ('ltd', 'limited'),
                         ('col', 'colonel'),
                         ('ft', 'fort'),
                     ]]

 # List of (regular expression, replacement) pairs for abbreviations in french:
 abbreviations_fr = [(re.compile('\\b%s\\.?' % x[0], re.IGNORECASE), x[1])
                     for x in [
                         ('M', 'monsieur'),
                         ('Mlle', 'mademoiselle'),
                         ('Mlles', 'mesdemoiselles'),
                         ('Mme', 'Madame'),
                         ('Mmes', 'Mesdames'),
                         ('N.B', 'nota bene'),
                         ('M', 'monsieur'),
                         ('p.c.q', 'parce que'),
                         ('Pr', 'professeur'),
                         ('qqch', 'quelque chose'),
                         ('rdv', 'rendez-vous'),
                         ('max', 'maximum'),
                         ('min', 'minimum'),
                         ('no', 'numéro'),
                         ('adr', 'adresse'),
                         ('dr', 'docteur'),
                         ('st', 'saint'),
                         ('co', 'companie'),
                         ('jr', 'junior'),
                         ('sgt', 'sergent'),
                         ('capt', 'capitain'),
                         ('col', 'colonel'),
                         ('av', 'avenue'),
                         ('av. J.-C', 'avant Jésus-Christ'),
                         ('apr. J.-C', 'après Jésus-Christ'),
                         ('art', 'article'),
                         ('boul', 'boulevard'),
                         ('c.-à-d', 'c’est-à-dire'),
                         ('etc', 'et cetera'),
                         ('ex', 'exemple'),
                         ('excl', 'exclusivement'),
                         ('boul', 'boulevard'),
                     ]]
@@ -22,7 +22,7 @@ class AttrDict(dict):

 def read_json_with_comments(json_path):
     # fallback to json
-    with open(json_path, "r", encoding = "utf-8") as f:
+    with open(json_path, "r", encoding="utf-8") as f:
         input_str = f.read()
     # handle comments
     input_str = re.sub(r'\\\n', '', input_str)
@@ -40,7 +40,7 @@ def load_config(config_path: str) -> AttrDict:

     ext = os.path.splitext(config_path)[1]
     if ext in (".yml", ".yaml"):
-        with open(config_path, "r", encoding = "utf-8") as f:
+        with open(config_path, "r", encoding="utf-8") as f:
             data = yaml.safe_load(f)
     else:
         data = read_json_with_comments(config_path)
@@ -61,7 +61,7 @@ def copy_model_files(c, config_file, out_path, new_fields):
     """
     # copy config.json
     copy_config_path = os.path.join(out_path, 'config.json')
-    config_lines = open(config_file, "r", encoding = "utf-8").readlines()
+    config_lines = open(config_file, "r", encoding="utf-8").readlines()
     # add extra information fields
     for key, value in new_fields.items():
         if isinstance(value, str):
@@ -144,8 +144,3 @@ class ModelManager(object):
         if isinstance(key, str) and len(my_dict[key]) > 0:
             return True
         return False
-
-
-
-
-
@@ -4,7 +4,7 @@ from torch import nn
 from torch.nn import functional as F


-class TorchSTFT(nn.Module):
+class TorchSTFT(nn.Module):  # pylint: disable=abstract-method
     def __init__(self, n_fft, hop_length, win_length, window='hann_window'):
         """ Torch based STFT operation """
         super(TorchSTFT, self).__init__()
@@ -22,8 +22,10 @@ class PositionalEncoding(nn.Module):

     def forward(self, x, noise_level):
         if x.shape[2] > self.pe.shape[1]:
-            self.init_pe_matrix(x.shape[1] ,x.shape[2], x)
-        return x + noise_level[..., None, None] + self.pe[:, :x.size(2)].repeat(x.shape[0], 1, 1) / self.C
+            self.init_pe_matrix(x.shape[1], x.shape[2], x)
+        return x + noise_level[..., None,
+                               None] + self.pe[:, :x.size(2)].repeat(
+                                   x.shape[0], 1, 1) / self.C

     def init_pe_matrix(self, n_channels, max_len, x):
         pe = torch.zeros(max_len, n_channels)
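Note: the reflowed `return` above leans on broadcasting: `noise_level` has shape `[B]`, and `[..., None, None]` lifts it to `[B, 1, 1]` so it adds across channels and time. A standalone sketch:

    import torch

    B, C, T = 2, 3, 5
    x = torch.zeros(B, C, T)
    noise_level = torch.tensor([0.1, 0.2])
    out = x + noise_level[..., None, None]  # broadcasts over C and T
    print(out.shape)  # torch.Size([2, 3, 5])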
@@ -171,5 +173,4 @@ class DBlock(nn.Module):
             self.res_block = weight_norm(self.res_block)
         for idx, layer in enumerate(self.main_block):
             if len(layer.state_dict()) != 0:
                 self.main_block[idx] = weight_norm(layer)
-
@@ -79,7 +79,7 @@ class Wavegrad(nn.Module):
         return x

     def load_noise_schedule(self, path):
-        beta = np.load(path, allow_pickle=True).item()['beta']
+        beta = np.load(path, allow_pickle=True).item()['beta']  # pylint: disable=unexpected-keyword-arg
         self.compute_noise_level(beta)

     @torch.no_grad()
@@ -91,8 +91,8 @@ class Wavegrad(nn.Module):
         y_n = torch.FloatTensor(y_n).unsqueeze(0).unsqueeze(0).to(x)
         sqrt_alpha_hat = self.noise_level.to(x)
         for n in range(len(self.alpha) - 1, -1, -1):
-            y_n = self.c1[n] * (y_n -
-                                self.c2[n] * self.forward(y_n, x, sqrt_alpha_hat[n].repeat(x.shape[0])))
+            y_n = self.c1[n] * (y_n - self.c2[n] * self.forward(
+                y_n, x, sqrt_alpha_hat[n].repeat(x.shape[0])))
             if n > 0:
                 z = torch.randn_like(y_n)
                 y_n += self.sigma[n - 1] * z
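Note: the reformatted line above is one step of the WaveGrad-style sampling loop: the model predicts the noise in `y_n`, the prediction is removed, and fresh noise is re-injected for all but the last step. A sketch of the update using the names from this hunk (`c1`, `c2`, `sigma` are assumed precomputed from the noise schedule):

    import torch

    def denoise_step(model, y_n, x, sqrt_alpha_hat, c1, c2, sigma, n):
        # model(...) plays the role of self.forward(...) in the diff
        noise_hat = model(y_n, x, sqrt_alpha_hat[n].repeat(x.shape[0]))
        y_n = c1[n] * (y_n - c2[n] * noise_hat)
        if n > 0:
            y_n = y_n + sigma[n - 1] * torch.randn_like(y_n)
        return y_n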
@@ -73,15 +73,15 @@ class Stretch2d(nn.Module):

 class UpsampleNetwork(nn.Module):
     def __init__(
             self,
             feat_dims,
             upsample_scales,
             compute_dims,
             num_res_blocks,
             res_out_dims,
             pad,
             use_aux_net,
     ):
         super().__init__()
         self.total_scale = np.cumproduct(upsample_scales)[-1]
         self.indent = pad * self.total_scale
@@ -118,9 +118,8 @@ class UpsampleNetwork(nn.Module):


 class Upsample(nn.Module):
-    def __init__(
-        self, scale, pad, num_res_blocks, feat_dims, compute_dims, res_out_dims, use_aux_net
-    ):
+    def __init__(self, scale, pad, num_res_blocks, feat_dims, compute_dims,
+                 res_out_dims, use_aux_net):
         super().__init__()
         self.scale = scale
         self.pad = pad
@@ -44,9 +44,11 @@ def log_sum_exp(x):


 # It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py
-def discretized_mix_logistic_loss(
-    y_hat, y, num_classes=65536, log_scale_min=None, reduce=True
-):
+def discretized_mix_logistic_loss(y_hat,
+                                  y,
+                                  num_classes=65536,
+                                  log_scale_min=None,
+                                  reduce=True):
     if log_scale_min is None:
         log_scale_min = float(np.log(1e-14))
     y_hat = y_hat.permute(0, 2, 1)
@@ -7,7 +7,7 @@ import pickle as pickle_tts
 from TTS.utils.io import RenamingUnpickler


-def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False):
+def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False):  # pylint: disable=redefined-builtin
     try:
         state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
     except ModuleNotFoundError:
@@ -166,7 +166,7 @@ class SSIMLossTests(unittest.TestCase):
         dummy_target = T.zeros(4, 8, 128).float()
         dummy_length = (T.ones(4) * 8).long()
         output = layer(dummy_input, dummy_target, dummy_length)
-        assert abs(output.item() - 1.0) < 1e-4 , "1.0 vs {}".format(output.item())
+        assert abs(output.item() - 1.0) < 1e-4, "1.0 vs {}".format(output.item())

         # test if padded values of input makes any difference
         dummy_input = T.ones(4, 8, 128).float()
@@ -217,4 +217,3 @@ class SSIMLossTests(unittest.TestCase):
             (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
         output = layer(dummy_input + mask, dummy_target, dummy_length)
         assert output.item() == 0, "0 vs {}".format(output.item())
-
@@ -161,7 +161,7 @@ def test_speedy_speech():
                            x_lengths,
                            y_lengths,
                            durations,
-                           g=torch.rand((B,256)).to(device))
+                           g=torch.rand((B, 256)).to(device))

     assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
     assert list(attn.shape) == [B, T_de, T_en]
@@ -356,4 +356,3 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
                 ), "param {} with shape {} not updated!! \n{}\n{}".format(
                     count, param.shape, param, param_ref)
                 count += 1
-
@@ -17,5 +17,5 @@ def test_currency() -> None:


 def test_expand_numbers() -> None:
-    assert "minus one" == phoneme_cleaners("-1")
-    assert "one" == phoneme_cleaners("1")
+    assert phoneme_cleaners("-1") == 'minus one'
+    assert phoneme_cleaners("1") == 'one'
@@ -17,7 +17,7 @@ def test_phoneme_to_sequence():
     lang = "en-us"
     sequence = phoneme_to_sequence(text, text_cleaner, lang)
     text_hat = sequence_to_phoneme(sequence)
-    sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
+    _ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
     text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
     gt = 'ɹiːsənt ɹᵻsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪŋkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹᵻspɑːnsᵻbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjʊleɪʃən ænd lɜːnɪŋ!'
     assert text_hat == text_hat_with_params == gt
@@ -20,18 +20,18 @@ class WavegradTrainTest(unittest.TestCase):

         criterion = torch.nn.L1Loss().to(device)
         model = Wavegrad(in_channels=80,
                          out_channels=1,
                          upsample_factors=[5, 5, 3, 2, 2],
                          upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2],
                                              [1, 2, 4, 8], [1, 2, 4, 8],
                                              [1, 2, 4, 8]])

         model_ref = Wavegrad(in_channels=80,
                              out_channels=1,
                              upsample_factors=[5, 5, 3, 2, 2],
                              upsample_dilations=[[1, 2, 1, 2], [1, 2, 1, 2],
                                                  [1, 2, 4, 8], [1, 2, 4, 8],
                                                  [1, 2, 4, 8]])
         model.train()
         model.to(device)
         betas = np.linspace(1e-6, 1e-2, 1000)