mirror of https://github.com/coqui-ai/TTS.git
a ton of linter updates
This commit is contained in:
parent
4422642ec0
commit
9a48ba3821
|
@ -170,7 +170,7 @@ def main():
|
||||||
args.vocoder_name = model_item['default_vocoder'] if args.vocoder_name is None else args.vocoder_name
|
args.vocoder_name = model_item['default_vocoder'] if args.vocoder_name is None else args.vocoder_name
|
||||||
|
|
||||||
if args.vocoder_name is not None:
|
if args.vocoder_name is not None:
|
||||||
vocoder_path, vocoder_config_path, vocoder_item = manager.download_model(args.vocoder_name)
|
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)
|
||||||
|
|
||||||
# CASE3: load custome models
|
# CASE3: load custome models
|
||||||
if args.model_path is not None:
|
if args.model_path is not None:
|
||||||
|
|
|
@ -573,7 +573,7 @@ def main(args): # pylint: disable=redefined-outer-name
|
||||||
if c.run_eval:
|
if c.run_eval:
|
||||||
target_loss = eval_avg_loss_dict['avg_loss']
|
target_loss = eval_avg_loss_dict['avg_loss']
|
||||||
best_loss = save_best_model(target_loss, best_loss, model, optimizer,
|
best_loss = save_best_model(target_loss, best_loss, model, optimizer,
|
||||||
global_step, epoch, c.r, OUT_PATH,
|
global_step, epoch, c.r, OUT_PATH, model_characters,
|
||||||
keep_all_best=keep_all_best, keep_after=keep_after)
|
keep_all_best=keep_all_best, keep_after=keep_after)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,6 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import argparse
|
|
||||||
import glob
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
@ -535,7 +533,7 @@ def main(args): # pylint: disable=redefined-outer-name
|
||||||
if c.run_eval:
|
if c.run_eval:
|
||||||
target_loss = eval_avg_loss_dict['avg_loss']
|
target_loss = eval_avg_loss_dict['avg_loss']
|
||||||
best_loss = save_best_model(target_loss, best_loss, model, optimizer,
|
best_loss = save_best_model(target_loss, best_loss, model, optimizer,
|
||||||
global_step, epoch, c.r, OUT_PATH,
|
global_step, epoch, c.r, OUT_PATH, model_characters,
|
||||||
keep_all_best=keep_all_best, keep_after=keep_after)
|
keep_all_best=keep_all_best, keep_after=keep_after)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -648,12 +648,14 @@ def main(args): # pylint: disable=redefined-outer-name
|
||||||
epoch,
|
epoch,
|
||||||
c.r,
|
c.r,
|
||||||
OUT_PATH,
|
OUT_PATH,
|
||||||
|
model_characters,
|
||||||
keep_all_best=keep_all_best,
|
keep_all_best=keep_all_best,
|
||||||
keep_after=keep_after,
|
keep_after=keep_after,
|
||||||
scaler=scaler.state_dict() if c.mixed_precision else None
|
scaler=scaler.state_dict() if c.mixed_precision else None
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
args = parse_arguments(sys.argv)
|
args = parse_arguments(sys.argv)
|
||||||
c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
|
c, OUT_PATH, AUDIO_PATH, c_logger, tb_logger = process_args(
|
||||||
|
|
|
@ -50,7 +50,7 @@ def setup_loader(ap, is_val=False, verbose=False):
|
||||||
sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None
|
sampler = DistributedSampler(dataset, shuffle=True) if num_gpus > 1 else None
|
||||||
loader = DataLoader(dataset,
|
loader = DataLoader(dataset,
|
||||||
batch_size=1 if is_val else c.batch_size,
|
batch_size=1 if is_val else c.batch_size,
|
||||||
shuffle=False if num_gpus > 1 else True,
|
shuffle=num_gpus == 0,
|
||||||
drop_last=False,
|
drop_last=False,
|
||||||
sampler=sampler,
|
sampler=sampler,
|
||||||
num_workers=c.num_val_loader_workers
|
num_workers=c.num_val_loader_workers
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import collections
|
import collections
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
from multiprocessing import Manager, Pool
|
from multiprocessing import Pool
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
|
|
@ -3,7 +3,7 @@ from glob import glob
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Tuple
|
from typing import List
|
||||||
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
|
@ -97,7 +97,7 @@ class ResidualConv1dBNBlock(nn.Module):
|
||||||
assert len(dilations) == num_res_blocks
|
assert len(dilations) == num_res_blocks
|
||||||
self.res_blocks = nn.ModuleList()
|
self.res_blocks = nn.ModuleList()
|
||||||
for idx, dilation in enumerate(dilations):
|
for idx, dilation in enumerate(dilations):
|
||||||
block = Conv1dBNBlock(in_channels if idx==0 else hidden_channels,
|
block = Conv1dBNBlock(in_channels if idx == 0 else hidden_channels,
|
||||||
out_channels if (idx + 1) == len(dilations) else hidden_channels,
|
out_channels if (idx + 1) == len(dilations) else hidden_channels,
|
||||||
hidden_channels,
|
hidden_channels,
|
||||||
kernel_size,
|
kernel_size,
|
||||||
|
|
|
@ -366,8 +366,10 @@ class RelativePositionTransformer(nn.Module):
|
||||||
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
|
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
|
||||||
|
|
||||||
self.ffn_layers.append(
|
self.ffn_layers.append(
|
||||||
FeedForwardNetwork(hidden_channels,
|
FeedForwardNetwork(
|
||||||
hidden_channels if (idx + 1) != self.num_layers else out_channels,
|
hidden_channels,
|
||||||
|
hidden_channels if
|
||||||
|
(idx + 1) != self.num_layers else out_channels,
|
||||||
hidden_channels_ffn,
|
hidden_channels_ffn,
|
||||||
kernel_size,
|
kernel_size,
|
||||||
dropout_p=dropout_p))
|
dropout_p=dropout_p))
|
||||||
|
|
|
@ -75,7 +75,7 @@ class ReferenceEncoder(nn.Module):
|
||||||
# x: 3D tensor [batch_size, post_conv_width,
|
# x: 3D tensor [batch_size, post_conv_width,
|
||||||
# num_channels*post_conv_height]
|
# num_channels*post_conv_height]
|
||||||
self.recurrence.flatten_parameters()
|
self.recurrence.flatten_parameters()
|
||||||
memory, out = self.recurrence(x)
|
_, out = self.recurrence(x)
|
||||||
# out: 3D tensor [seq_len==1, batch_size, encoding_size=128]
|
# out: 3D tensor [seq_len==1, batch_size, encoding_size=128]
|
||||||
|
|
||||||
return out.squeeze(0)
|
return out.squeeze(0)
|
||||||
|
|
|
@ -2,13 +2,12 @@ import math
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
from inspect import signature
|
|
||||||
from torch.nn import functional
|
from torch.nn import functional
|
||||||
from TTS.tts.utils.generic_utils import sequence_mask
|
from TTS.tts.utils.generic_utils import sequence_mask
|
||||||
from TTS.tts.utils.ssim import ssim
|
from TTS.tts.utils.ssim import ssim
|
||||||
|
|
||||||
|
|
||||||
# pylint: disable=abstract-method Method
|
# pylint: disable=abstract-method
|
||||||
# relates https://github.com/pytorch/pytorch/issues/42305
|
# relates https://github.com/pytorch/pytorch/issues/42305
|
||||||
class L1LossMasked(nn.Module):
|
class L1LossMasked(nn.Module):
|
||||||
def __init__(self, seq_len_norm):
|
def __init__(self, seq_len_norm):
|
||||||
|
|
|
@ -78,8 +78,7 @@ class RelativePositionTransformerEncoder(nn.Module):
|
||||||
kernel_size=5,
|
kernel_size=5,
|
||||||
num_res_blocks=3,
|
num_res_blocks=3,
|
||||||
num_conv_blocks=1,
|
num_conv_blocks=1,
|
||||||
dilations=[1, 1, 1]
|
dilations=[1, 1, 1])
|
||||||
)
|
|
||||||
self.rel_pos_transformer = RelativePositionTransformer(
|
self.rel_pos_transformer = RelativePositionTransformer(
|
||||||
hidden_channels, out_channels, hidden_channels, **params)
|
hidden_channels, out_channels, hidden_channels, **params)
|
||||||
|
|
||||||
|
@ -104,8 +103,7 @@ class ResidualConv1dBNEncoder(nn.Module):
|
||||||
"""
|
"""
|
||||||
def __init__(self, in_channels, out_channels, hidden_channels, params):
|
def __init__(self, in_channels, out_channels, hidden_channels, params):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.prenet = nn.Sequential(
|
self.prenet = nn.Sequential(nn.Conv1d(in_channels, hidden_channels, 1),
|
||||||
nn.Conv1d(in_channels, hidden_channels, 1),
|
|
||||||
nn.ReLU())
|
nn.ReLU())
|
||||||
self.res_conv_block = ResidualConv1dBNBlock(hidden_channels,
|
self.res_conv_block = ResidualConv1dBNBlock(hidden_channels,
|
||||||
hidden_channels,
|
hidden_channels,
|
||||||
|
@ -183,9 +181,8 @@ class Encoder(nn.Module):
|
||||||
# init encoder
|
# init encoder
|
||||||
if encoder_type.lower() == "transformer":
|
if encoder_type.lower() == "transformer":
|
||||||
# text encoder
|
# text encoder
|
||||||
self.encoder = RelativePositionTransformerEncoder(in_hidden_channels,
|
self.encoder = RelativePositionTransformerEncoder(
|
||||||
out_channels,
|
in_hidden_channels, out_channels, in_hidden_channels,
|
||||||
in_hidden_channels,
|
|
||||||
encoder_params) # pylint: disable=unexpected-keyword-arg
|
encoder_params) # pylint: disable=unexpected-keyword-arg
|
||||||
elif encoder_type.lower() == 'residual_conv_bn':
|
elif encoder_type.lower() == 'residual_conv_bn':
|
||||||
self.encoder = ResidualConv1dBNEncoder(in_hidden_channels,
|
self.encoder = ResidualConv1dBNEncoder(in_hidden_channels,
|
||||||
|
|
|
@ -33,7 +33,7 @@ class SpeedySpeech(nn.Module):
|
||||||
external_c (bool, optional): enable external speaker embeddings. Defaults to False.
|
external_c (bool, optional): enable external speaker embeddings. Defaults to False.
|
||||||
c_in_channels (int, optional): number of channels in speaker embedding vectors. Defaults to 0.
|
c_in_channels (int, optional): number of channels in speaker embedding vectors. Defaults to 0.
|
||||||
"""
|
"""
|
||||||
# pylint: disable=dangerous-default-value
|
# pylint: disable=dangerous-default-value
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -171,7 +171,7 @@ class SpeedySpeech(nn.Module):
|
||||||
"""
|
"""
|
||||||
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
|
o_en, o_en_dp, x_mask, g = self._forward_encoder(x, x_lengths, g)
|
||||||
o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
|
o_dr_log = self.duration_predictor(o_en_dp.detach(), x_mask)
|
||||||
o_de, attn= self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g)
|
o_de, attn = self._forward_decoder(o_en, o_en_dp, dr, x_mask, y_lengths, g=g)
|
||||||
return o_de, o_dr_log.squeeze(1), attn
|
return o_de, o_dr_log.squeeze(1), attn
|
||||||
|
|
||||||
def inference(self, x, x_lengths, g=None): # pylint: disable=unused-argument
|
def inference(self, x, x_lengths, g=None): # pylint: disable=unused-argument
|
||||||
|
|
|
@ -10,7 +10,7 @@ import re
|
||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
|
|
||||||
def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False) -> str:
|
def _num2chinese(num: str, big=False, simp=True, o=False, twoalt=False) -> str:
|
||||||
"""Convert numerical arabic numbers (0->9) to chinese hanzi numbers (〇 -> 九)
|
"""Convert numerical arabic numbers (0->9) to chinese hanzi numbers (〇 -> 九)
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
@ -32,7 +32,7 @@ def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False) -> str:
|
||||||
nd = str(num)
|
nd = str(num)
|
||||||
if abs(float(nd)) >= 1e48:
|
if abs(float(nd)) >= 1e48:
|
||||||
raise ValueError('number out of range')
|
raise ValueError('number out of range')
|
||||||
elif 'e' in nd:
|
if 'e' in nd:
|
||||||
raise ValueError('scientific notation is not supported')
|
raise ValueError('scientific notation is not supported')
|
||||||
c_symbol = '正负点' if simp else '正負點'
|
c_symbol = '正负点' if simp else '正負點'
|
||||||
if o: # formal
|
if o: # formal
|
||||||
|
@ -69,7 +69,7 @@ def _num2chinese(num :str, big=False, simp=True, o=False, twoalt=False) -> str:
|
||||||
if int(unit) == 0: # 0000
|
if int(unit) == 0: # 0000
|
||||||
intresult.append(c_basic[0])
|
intresult.append(c_basic[0])
|
||||||
continue
|
continue
|
||||||
elif nu > 0 and int(unit) == 2: # 0002
|
if nu > 0 and int(unit) == 2: # 0002
|
||||||
intresult.append(c_twoalt + c_unit2[nu - 1])
|
intresult.append(c_twoalt + c_unit2[nu - 1])
|
||||||
continue
|
continue
|
||||||
ulist = []
|
ulist = []
|
||||||
|
|
|
@ -135,7 +135,7 @@ def setup_model(num_chars, num_speakers, c, speaker_embedding_dim=None):
|
||||||
return model
|
return model
|
||||||
|
|
||||||
def is_tacotron(c):
|
def is_tacotron(c):
|
||||||
return False if c['model'] in ['speedy_speech', 'glow_tts'] else True
|
return not c['model'] in ['speedy_speech', 'glow_tts']
|
||||||
|
|
||||||
def check_config_tts(c):
|
def check_config_tts(c):
|
||||||
check_argument('model', c, enum_list=['tacotron', 'tacotron2', 'glow_tts', 'speedy_speech'], restricted=True, val_type=str)
|
check_argument('model', c, enum_list=['tacotron', 'tacotron2', 'glow_tts', 'speedy_speech'], restricted=True, val_type=str)
|
||||||
|
|
|
@ -7,7 +7,7 @@ from TTS.utils.io import RenamingUnpickler
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False, eval=False):
|
def load_checkpoint(model, checkpoint_path, amp=None, use_cuda=False, eval=False): # pylint: disable=redefined-builtin
|
||||||
"""Load ```TTS.tts.models``` checkpoints.
|
"""Load ```TTS.tts.models``` checkpoints.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|
|
@ -63,8 +63,8 @@ def parse_speakers(c, args, meta_data_train, OUT_PATH):
|
||||||
speaker_embedding_dim = None
|
speaker_embedding_dim = None
|
||||||
save_speaker_mapping(OUT_PATH, speaker_mapping)
|
save_speaker_mapping(OUT_PATH, speaker_mapping)
|
||||||
num_speakers = len(speaker_mapping)
|
num_speakers = len(speaker_mapping)
|
||||||
print(" > Training with {} speakers: {}".format(len(speakers),
|
print(" > Training with {} speakers: {}".format(
|
||||||
", ".join(speakers)))
|
len(speakers), ", ".join(speakers)))
|
||||||
else:
|
else:
|
||||||
num_speakers = 0
|
num_speakers = 0
|
||||||
speaker_embedding_dim = None
|
speaker_embedding_dim = None
|
||||||
|
|
|
@ -17,17 +17,22 @@ def create_window(window_size, channel):
|
||||||
window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous())
|
window = Variable(_2D_window.expand(channel, 1, window_size, window_size).contiguous())
|
||||||
return window
|
return window
|
||||||
|
|
||||||
def _ssim(img1, img2, window, window_size, channel, size_average = True):
|
|
||||||
mu1 = F.conv2d(img1, window, padding = window_size//2, groups = channel)
|
def _ssim(img1, img2, window, window_size, channel, size_average=True):
|
||||||
mu2 = F.conv2d(img2, window, padding = window_size//2, groups = channel)
|
mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
|
||||||
|
mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
|
||||||
|
|
||||||
mu1_sq = mu1.pow(2)
|
mu1_sq = mu1.pow(2)
|
||||||
mu2_sq = mu2.pow(2)
|
mu2_sq = mu2.pow(2)
|
||||||
mu1_mu2 = mu1*mu2
|
mu1_mu2 = mu1*mu2
|
||||||
|
|
||||||
sigma1_sq = F.conv2d(img1*img1, window, padding = window_size//2, groups = channel) - mu1_sq
|
sigma1_sq = F.conv2d(
|
||||||
sigma2_sq = F.conv2d(img2*img2, window, padding = window_size//2, groups = channel) - mu2_sq
|
img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
|
||||||
sigma12 = F.conv2d(img1*img2, window, padding = window_size//2, groups = channel) - mu1_mu2
|
sigma2_sq = F.conv2d(
|
||||||
|
img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
|
||||||
|
sigma12 = F.conv2d(
|
||||||
|
img1 * img2, window, padding=window_size // 2,
|
||||||
|
groups=channel) - mu1_mu2
|
||||||
|
|
||||||
C1 = 0.01**2
|
C1 = 0.01**2
|
||||||
C2 = 0.03**2
|
C2 = 0.03**2
|
||||||
|
@ -39,7 +44,7 @@ def _ssim(img1, img2, window, window_size, channel, size_average = True):
|
||||||
return ssim_map.mean(1).mean(1).mean(1)
|
return ssim_map.mean(1).mean(1).mean(1)
|
||||||
|
|
||||||
class SSIM(torch.nn.Module):
|
class SSIM(torch.nn.Module):
|
||||||
def __init__(self, window_size = 11, size_average = True):
|
def __init__(self, window_size=11, size_average=True):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.window_size = window_size
|
self.window_size = window_size
|
||||||
self.size_average = size_average
|
self.size_average = size_average
|
||||||
|
@ -64,7 +69,8 @@ class SSIM(torch.nn.Module):
|
||||||
|
|
||||||
return _ssim(img1, img2, window, self.window_size, channel, self.size_average)
|
return _ssim(img1, img2, window, self.window_size, channel, self.size_average)
|
||||||
|
|
||||||
def ssim(img1, img2, window_size = 11, size_average = True):
|
|
||||||
|
def ssim(img1, img2, window_size=11, size_average=True):
|
||||||
(_, channel, _, _) = img1.size()
|
(_, channel, _, _) = img1.size()
|
||||||
window = create_window(window_size, channel)
|
window = create_window(window_size, channel)
|
||||||
|
|
||||||
|
|
|
@ -20,9 +20,13 @@ def text_to_seqvec(text, CONFIG):
|
||||||
add_blank=CONFIG['add_blank'] if 'add_blank' in CONFIG.keys() else False),
|
add_blank=CONFIG['add_blank'] if 'add_blank' in CONFIG.keys() else False),
|
||||||
dtype=np.int32)
|
dtype=np.int32)
|
||||||
else:
|
else:
|
||||||
seq = np.asarray(
|
seq = np.asarray(text_to_sequence(
|
||||||
text_to_sequence(text, text_cleaner, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None,
|
text,
|
||||||
add_blank=CONFIG['add_blank'] if 'add_blank' in CONFIG.keys() else False), dtype=np.int32)
|
text_cleaner,
|
||||||
|
tp=CONFIG.characters if 'characters' in CONFIG.keys() else None,
|
||||||
|
add_blank=CONFIG['add_blank']
|
||||||
|
if 'add_blank' in CONFIG.keys() else False),
|
||||||
|
dtype=np.int32)
|
||||||
return seq
|
return seq
|
||||||
|
|
||||||
|
|
||||||
|
@ -77,9 +81,9 @@ def run_model_torch(model, inputs, CONFIG, truncated, speaker_id=None, style_mel
|
||||||
inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable
|
inputs_lengths = torch.tensor(inputs.shape[1:2]).to(inputs.device) # pylint: disable=not-callable
|
||||||
if hasattr(model, 'module'):
|
if hasattr(model, 'module'):
|
||||||
# distributed model
|
# distributed model
|
||||||
postnet_output, alignments= model.module.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
|
postnet_output, alignments = model.module.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
|
||||||
else:
|
else:
|
||||||
postnet_output, alignments= model.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
|
postnet_output, alignments = model.inference(inputs, inputs_lengths, g=speaker_id if speaker_id is not None else speaker_embeddings)
|
||||||
postnet_output = postnet_output.permute(0, 2, 1)
|
postnet_output = postnet_output.permute(0, 2, 1)
|
||||||
# these only belong to tacotron models.
|
# these only belong to tacotron models.
|
||||||
decoder_output = None
|
decoder_output = None
|
||||||
|
|
|
@ -22,7 +22,7 @@ class AttrDict(dict):
|
||||||
|
|
||||||
def read_json_with_comments(json_path):
|
def read_json_with_comments(json_path):
|
||||||
# fallback to json
|
# fallback to json
|
||||||
with open(json_path, "r", encoding = "utf-8") as f:
|
with open(json_path, "r", encoding="utf-8") as f:
|
||||||
input_str = f.read()
|
input_str = f.read()
|
||||||
# handle comments
|
# handle comments
|
||||||
input_str = re.sub(r'\\\n', '', input_str)
|
input_str = re.sub(r'\\\n', '', input_str)
|
||||||
|
@ -40,7 +40,7 @@ def load_config(config_path: str) -> AttrDict:
|
||||||
|
|
||||||
ext = os.path.splitext(config_path)[1]
|
ext = os.path.splitext(config_path)[1]
|
||||||
if ext in (".yml", ".yaml"):
|
if ext in (".yml", ".yaml"):
|
||||||
with open(config_path, "r", encoding = "utf-8") as f:
|
with open(config_path, "r", encoding="utf-8") as f:
|
||||||
data = yaml.safe_load(f)
|
data = yaml.safe_load(f)
|
||||||
else:
|
else:
|
||||||
data = read_json_with_comments(config_path)
|
data = read_json_with_comments(config_path)
|
||||||
|
@ -61,7 +61,7 @@ def copy_model_files(c, config_file, out_path, new_fields):
|
||||||
"""
|
"""
|
||||||
# copy config.json
|
# copy config.json
|
||||||
copy_config_path = os.path.join(out_path, 'config.json')
|
copy_config_path = os.path.join(out_path, 'config.json')
|
||||||
config_lines = open(config_file, "r", encoding = "utf-8").readlines()
|
config_lines = open(config_file, "r", encoding="utf-8").readlines()
|
||||||
# add extra information fields
|
# add extra information fields
|
||||||
for key, value in new_fields.items():
|
for key, value in new_fields.items():
|
||||||
if isinstance(value, str):
|
if isinstance(value, str):
|
||||||
|
|
|
@ -144,8 +144,3 @@ class ModelManager(object):
|
||||||
if isinstance(key, str) and len(my_dict[key]) > 0:
|
if isinstance(key, str) and len(my_dict[key]) > 0:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,7 @@ from torch import nn
|
||||||
from torch.nn import functional as F
|
from torch.nn import functional as F
|
||||||
|
|
||||||
|
|
||||||
class TorchSTFT(nn.Module):
|
class TorchSTFT(nn.Module): # pylint: disable=abstract-method
|
||||||
def __init__(self, n_fft, hop_length, win_length, window='hann_window'):
|
def __init__(self, n_fft, hop_length, win_length, window='hann_window'):
|
||||||
""" Torch based STFT operation """
|
""" Torch based STFT operation """
|
||||||
super(TorchSTFT, self).__init__()
|
super(TorchSTFT, self).__init__()
|
||||||
|
|
|
@ -22,8 +22,10 @@ class PositionalEncoding(nn.Module):
|
||||||
|
|
||||||
def forward(self, x, noise_level):
|
def forward(self, x, noise_level):
|
||||||
if x.shape[2] > self.pe.shape[1]:
|
if x.shape[2] > self.pe.shape[1]:
|
||||||
self.init_pe_matrix(x.shape[1] ,x.shape[2], x)
|
self.init_pe_matrix(x.shape[1], x.shape[2], x)
|
||||||
return x + noise_level[..., None, None] + self.pe[:, :x.size(2)].repeat(x.shape[0], 1, 1) / self.C
|
return x + noise_level[..., None,
|
||||||
|
None] + self.pe[:, :x.size(2)].repeat(
|
||||||
|
x.shape[0], 1, 1) / self.C
|
||||||
|
|
||||||
def init_pe_matrix(self, n_channels, max_len, x):
|
def init_pe_matrix(self, n_channels, max_len, x):
|
||||||
pe = torch.zeros(max_len, n_channels)
|
pe = torch.zeros(max_len, n_channels)
|
||||||
|
@ -172,4 +174,3 @@ class DBlock(nn.Module):
|
||||||
for idx, layer in enumerate(self.main_block):
|
for idx, layer in enumerate(self.main_block):
|
||||||
if len(layer.state_dict()) != 0:
|
if len(layer.state_dict()) != 0:
|
||||||
self.main_block[idx] = weight_norm(layer)
|
self.main_block[idx] = weight_norm(layer)
|
||||||
|
|
||||||
|
|
|
@ -79,7 +79,7 @@ class Wavegrad(nn.Module):
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def load_noise_schedule(self, path):
|
def load_noise_schedule(self, path):
|
||||||
beta = np.load(path, allow_pickle=True).item()['beta']
|
beta = np.load(path, allow_pickle=True).item()['beta'] # pylint: disable=unexpected-keyword-arg
|
||||||
self.compute_noise_level(beta)
|
self.compute_noise_level(beta)
|
||||||
|
|
||||||
@torch.no_grad()
|
@torch.no_grad()
|
||||||
|
@ -91,8 +91,8 @@ class Wavegrad(nn.Module):
|
||||||
y_n = torch.FloatTensor(y_n).unsqueeze(0).unsqueeze(0).to(x)
|
y_n = torch.FloatTensor(y_n).unsqueeze(0).unsqueeze(0).to(x)
|
||||||
sqrt_alpha_hat = self.noise_level.to(x)
|
sqrt_alpha_hat = self.noise_level.to(x)
|
||||||
for n in range(len(self.alpha) - 1, -1, -1):
|
for n in range(len(self.alpha) - 1, -1, -1):
|
||||||
y_n = self.c1[n] * (y_n -
|
y_n = self.c1[n] * (y_n - self.c2[n] * self.forward(
|
||||||
self.c2[n] * self.forward(y_n, x, sqrt_alpha_hat[n].repeat(x.shape[0])))
|
y_n, x, sqrt_alpha_hat[n].repeat(x.shape[0])))
|
||||||
if n > 0:
|
if n > 0:
|
||||||
z = torch.randn_like(y_n)
|
z = torch.randn_like(y_n)
|
||||||
y_n += self.sigma[n - 1] * z
|
y_n += self.sigma[n - 1] * z
|
||||||
|
|
|
@ -118,9 +118,8 @@ class UpsampleNetwork(nn.Module):
|
||||||
|
|
||||||
|
|
||||||
class Upsample(nn.Module):
|
class Upsample(nn.Module):
|
||||||
def __init__(
|
def __init__(self, scale, pad, num_res_blocks, feat_dims, compute_dims,
|
||||||
self, scale, pad, num_res_blocks, feat_dims, compute_dims, res_out_dims, use_aux_net
|
res_out_dims, use_aux_net):
|
||||||
):
|
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.scale = scale
|
self.scale = scale
|
||||||
self.pad = pad
|
self.pad = pad
|
||||||
|
|
|
@ -44,9 +44,11 @@ def log_sum_exp(x):
|
||||||
|
|
||||||
|
|
||||||
# It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py
|
# It is adapted from https://github.com/r9y9/wavenet_vocoder/blob/master/wavenet_vocoder/mixture.py
|
||||||
def discretized_mix_logistic_loss(
|
def discretized_mix_logistic_loss(y_hat,
|
||||||
y_hat, y, num_classes=65536, log_scale_min=None, reduce=True
|
y,
|
||||||
):
|
num_classes=65536,
|
||||||
|
log_scale_min=None,
|
||||||
|
reduce=True):
|
||||||
if log_scale_min is None:
|
if log_scale_min is None:
|
||||||
log_scale_min = float(np.log(1e-14))
|
log_scale_min = float(np.log(1e-14))
|
||||||
y_hat = y_hat.permute(0, 2, 1)
|
y_hat = y_hat.permute(0, 2, 1)
|
||||||
|
|
|
@ -7,7 +7,7 @@ import pickle as pickle_tts
|
||||||
from TTS.utils.io import RenamingUnpickler
|
from TTS.utils.io import RenamingUnpickler
|
||||||
|
|
||||||
|
|
||||||
def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False):
|
def load_checkpoint(model, checkpoint_path, use_cuda=False, eval=False): # pylint: disable=redefined-builtin
|
||||||
try:
|
try:
|
||||||
state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
|
state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
|
||||||
except ModuleNotFoundError:
|
except ModuleNotFoundError:
|
||||||
|
|
|
@ -166,7 +166,7 @@ class SSIMLossTests(unittest.TestCase):
|
||||||
dummy_target = T.zeros(4, 8, 128).float()
|
dummy_target = T.zeros(4, 8, 128).float()
|
||||||
dummy_length = (T.ones(4) * 8).long()
|
dummy_length = (T.ones(4) * 8).long()
|
||||||
output = layer(dummy_input, dummy_target, dummy_length)
|
output = layer(dummy_input, dummy_target, dummy_length)
|
||||||
assert abs(output.item() - 1.0) < 1e-4 , "1.0 vs {}".format(output.item())
|
assert abs(output.item() - 1.0) < 1e-4, "1.0 vs {}".format(output.item())
|
||||||
|
|
||||||
# test if padded values of input makes any difference
|
# test if padded values of input makes any difference
|
||||||
dummy_input = T.ones(4, 8, 128).float()
|
dummy_input = T.ones(4, 8, 128).float()
|
||||||
|
@ -217,4 +217,3 @@ class SSIMLossTests(unittest.TestCase):
|
||||||
(sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
|
(sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
|
||||||
output = layer(dummy_input + mask, dummy_target, dummy_length)
|
output = layer(dummy_input + mask, dummy_target, dummy_length)
|
||||||
assert output.item() == 0, "0 vs {}".format(output.item())
|
assert output.item() == 0, "0 vs {}".format(output.item())
|
||||||
|
|
||||||
|
|
|
@ -161,7 +161,7 @@ def test_speedy_speech():
|
||||||
x_lengths,
|
x_lengths,
|
||||||
y_lengths,
|
y_lengths,
|
||||||
durations,
|
durations,
|
||||||
g=torch.rand((B,256)).to(device))
|
g=torch.rand((B, 256)).to(device))
|
||||||
|
|
||||||
assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
|
assert list(o_de.shape) == [B, 80, T_de], f"{list(o_de.shape)}"
|
||||||
assert list(attn.shape) == [B, T_de, T_en]
|
assert list(attn.shape) == [B, T_de, T_en]
|
||||||
|
|
|
@ -356,4 +356,3 @@ class SCGSTMultiSpeakeTacotronTrainTest(unittest.TestCase):
|
||||||
), "param {} with shape {} not updated!! \n{}\n{}".format(
|
), "param {} with shape {} not updated!! \n{}\n{}".format(
|
||||||
count, param.shape, param, param_ref)
|
count, param.shape, param, param_ref)
|
||||||
count += 1
|
count += 1
|
||||||
|
|
||||||
|
|
|
@ -17,5 +17,5 @@ def test_currency() -> None:
|
||||||
|
|
||||||
|
|
||||||
def test_expand_numbers() -> None:
|
def test_expand_numbers() -> None:
|
||||||
assert "minus one" == phoneme_cleaners("-1")
|
assert phoneme_cleaners("-1") == 'minus one'
|
||||||
assert "one" == phoneme_cleaners("1")
|
assert phoneme_cleaners("1") == 'one'
|
||||||
|
|
|
@ -17,7 +17,7 @@ def test_phoneme_to_sequence():
|
||||||
lang = "en-us"
|
lang = "en-us"
|
||||||
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
sequence = phoneme_to_sequence(text, text_cleaner, lang)
|
||||||
text_hat = sequence_to_phoneme(sequence)
|
text_hat = sequence_to_phoneme(sequence)
|
||||||
sequence_with_params = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
_ = phoneme_to_sequence(text, text_cleaner, lang, tp=conf.characters)
|
||||||
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
text_hat_with_params = sequence_to_phoneme(sequence, tp=conf.characters)
|
||||||
gt = 'ɹiːsənt ɹᵻsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪŋkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹᵻspɑːnsᵻbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjʊleɪʃən ænd lɜːnɪŋ!'
|
gt = 'ɹiːsənt ɹᵻsɜːtʃ æt hɑːɹvɚd hɐz ʃoʊn mɛdᵻteɪɾɪŋ fɔːɹ æz lɪɾəl æz eɪt wiːks kæn æktʃuːəli ɪŋkɹiːs, ðə ɡɹeɪ mæɾɚɹ ɪnðə pɑːɹts ʌvðə bɹeɪn ɹᵻspɑːnsᵻbəl fɔːɹ ɪmoʊʃənəl ɹɛɡjʊleɪʃən ænd lɜːnɪŋ!'
|
||||||
assert text_hat == text_hat_with_params == gt
|
assert text_hat == text_hat_with_params == gt
|
||||||
|
|
Loading…
Reference in New Issue