From 11e789532905beb702b492432dc72b136a37587e Mon Sep 17 00:00:00 2001
From: Reuben Morais
Date: Fri, 19 Jul 2019 08:46:23 +0200
Subject: [PATCH] Fix Pylint issues

---
 dataset_analysis/analyze.py          | 22 +++----
 datasets/TTSDataset.py               | 29 +++++----
 datasets/preprocess.py               |  6 +-
 debug_config.py => debug_config.json |  0
 distribute.py                        | 53 +++++++---------
 layers/common_layers.py              |  7 ++-
 layers/custom_layers.py              |  4 +-
 layers/gst_layers.py                 |  7 ++-
 layers/losses.py                     | 26 +++-----
 layers/tacotron.py                   | 26 ++++----
 layers/tacotron2.py                  | 19 +++---
 models/tacotron.py                   |  4 +-
 models/tacotron2.py                  |  6 +-
 models/tacotrongst.py                |  4 +-
 server/server.py                     |  2 +-
 server/synthesizer.py                | 93 ++++++++++++++--------------
 setup.py                             |  3 -
 test_cluster.py                      |  2 +-
 tests/generic_utils_text.py          | 14 ++---
 tests/test_audio.py                  |  2 -
 tests/test_layers.py                 |  3 +-
 tests/test_loader.py                 |  3 +-
 tests/test_tacotron2_model.py        |  2 +-
 tests/test_tacotron_model.py         |  7 +--
 tests/test_text_processing.py        |  1 -
 train.py                             | 58 +++++++--------
 utils/audio.py                       | 47 +++++++-------
 utils/data.py                        |  1 -
 utils/generic_utils.py               |  7 +--
 utils/logger.py                      |  7 +--
 utils/synthesis.py                   | 16 ++---
 utils/text/__init__.py               | 50 +++++++--------
 utils/text/cmudict.py                | 34 +++++-----
 utils/text/number_norm.py            | 14 ++---
 utils/visual.py                      |  7 +--
 35 files changed, 270 insertions(+), 316 deletions(-)
 rename debug_config.py => debug_config.json (100%)

diff --git a/dataset_analysis/analyze.py b/dataset_analysis/analyze.py
index 5fdf3973..f34605dd 100644
--- a/dataset_analysis/analyze.py
+++ b/dataset_analysis/analyze.py
@@ -1,12 +1,10 @@
-# visualisation tools for mimic2 
+# visualisation tools for mimic2
 import matplotlib.pyplot as plt
 from statistics import stdev, mode, mean, median
 from statistics import StatisticsError
 import argparse
-import glob
 import os
 import csv
-import copy
 import seaborn as sns
 import random
 from text.cmudict import CMUDict
@@ -32,7 +30,7 @@ def append_data_statistics(meta_data):
             std = stdev(
                 d["audio_len"] for d in data
             )
-        except:
+        except StatisticsError:
             std = 0
 
         meta_data[char_cnt]["mean"] = mean_audio_len
@@ -114,7 +112,7 @@ def plot(meta_data, save_path=None):
     y_mode = graph_data['y_mode']
     y_median = graph_data['y_median']
     y_num_samples = graph_data['y_num_samples']
-    
+
     plt.figure()
     plt.plot(x, y_avg, 'ro')
     plt.xlabel("character lengths", fontsize=30)
@@ -122,7 +120,7 @@ def plot(meta_data, save_path=None):
     if save:
         name = "char_len_vs_avg_secs"
         plt.savefig(os.path.join(save_path, name))
-    
+
     plt.figure()
     plt.plot(x, y_mode, 'ro')
     plt.xlabel("character lengths", fontsize=30)
@@ -182,12 +180,12 @@ def plot_phonemes(train_path, cmu_dict_path, save_path):
     for key in phonemes:
         x.append(key)
         y.append(phonemes[key])
-    
+
     plt.figure()
     plt.rcParams["figure.figsize"] = (50, 20)
-    plot = sns.barplot(x, y)
+    barplot = sns.barplot(x, y)
     if save_path:
-        fig = plot.get_figure()
+        fig = barplot.get_figure()
         fig.savefig(os.path.join(save_path, "phoneme_dist"))
@@ -201,7 +199,7 @@ def main():
         '--save_to', help='path to save charts of data to'
     )
     parser.add_argument(
-        '--cmu_dict_path', help='give cmudict-0.7b to see phoneme distribution' 
+        '--cmu_dict_path', help='give cmudict-0.7b to see phoneme distribution'
     )
     args = parser.parse_args()
     meta_data = process_meta_data(args.train_file_path)
@@ -210,8 +208,8 @@ def main():
     if args.cmu_dict_path:
         plt.rcParams["figure.figsize"] = (30, 10)
         plot_phonemes(args.train_file_path, args.cmu_dict_path, args.save_to)
-    
+
     plt.show()
 
 if __name__ == '__main__':
-    main()
\ No newline at end of file
+    main()
diff --git 
a/datasets/TTSDataset.py b/datasets/TTSDataset.py
index 9cf366c1..ecf22f1e 100644
--- a/datasets/TTSDataset.py
+++ b/datasets/TTSDataset.py
@@ -1,14 +1,12 @@
 import os
 import numpy as np
 import collections
-import librosa
 import torch
 import random
 from torch.utils.data import Dataset
 from utils.text import text_to_sequence, phoneme_to_sequence
-from utils.data import (prepare_data, pad_per_step, prepare_tensor,
-                        prepare_stop_target)
+from utils.data import prepare_data, prepare_tensor, prepare_stop_target
 
 
 class MyDataset(Dataset):
@@ -35,14 +33,14 @@ class MyDataset(Dataset):
             meta_data (list): list of dataset instances.
             speaker_id_cache_path (str): path where the speaker name to id
                 mapping is stored
-            batch_group_size (int): (0) range of batch randomization after sorting 
-                sequences by length. 
-            min_seq_len (int): (0) minimum sequence length to be processed 
+            batch_group_size (int): (0) range of batch randomization after sorting
+                sequences by length.
+            min_seq_len (int): (0) minimum sequence length to be processed
                 by the loader.
             max_seq_len (int): (float("inf")) maximum sequence length.
             use_phonemes (bool): (true) if true, text converted to phonemes.
-            phoneme_cache_path (str): path to cache phoneme features. 
-            phoneme_language (str): one the languages from
+            phoneme_cache_path (str): path to cache phoneme features.
+            phoneme_language (str): one of the languages from
                 https://github.com/bootphon/phonemizer#languages
             enable_eos_bos (bool): enable end of sentence and beginning of sentences characters.
             verbose (bool): print diagnostic information.
@@ -76,7 +74,8 @@ class MyDataset(Dataset):
         audio = self.ap.load_wav(filename)
         return audio
 
-    def load_np(self, filename):
+    @staticmethod
+    def load_np(filename):
         data = np.load(filename).astype('float32')
         return data
@@ -87,7 +86,7 @@ class MyDataset(Dataset):
         if os.path.isfile(tmp_path):
             try:
                 text = np.load(tmp_path)
-            except:
+            except (IOError, ValueError):
                 print(" > ERROR: phoneme cannot be loaded for {}. Recomputing.".format(wav_file))
                 text = np.asarray(
                     phoneme_to_sequence(
@@ -126,7 +125,7 @@ class MyDataset(Dataset):
     def sort_items(self):
         r"""Sort instances based on text length in ascending order"""
         lengths = np.array([len(ins[0]) for ins in self.items])
-        
+
         idxs = np.argsort(lengths)
         new_items = []
         ignored = []
@@ -150,10 +149,10 @@ class MyDataset(Dataset):
             print(" | > Max length sequence: {}".format(np.max(lengths)))
             print(" | > Min length sequence: {}".format(np.min(lengths)))
             print(" | > Avg length sequence: {}".format(np.mean(lengths)))
-            print(" | > Num. instances discarded by max-min seq limits: {}".format(
-                len(ignored), self.min_seq_len))
+            print(" | > Num. instances discarded by max-min (max={}, min={}) seq limits: {}".format(
+                self.max_seq_len, self.min_seq_len, len(ignored)))
             print(" | > Batch group size: {}.".format(self.batch_group_size))
-        
+
     def __len__(self):
         return len(self.items)
@@ -182,7 +181,7 @@ class MyDataset(Dataset):
             ]
             text = [batch[idx]['text'] for idx in ids_sorted_decreasing]
             speaker_name = [batch[idx]['speaker_name']
-                           for idx in ids_sorted_decreasing]
+                            for idx in ids_sorted_decreasing]
 
             mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
             linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
diff --git a/datasets/preprocess.py b/datasets/preprocess.py
index 2862a3e1..bf1c8d97 100644
--- a/datasets/preprocess.py
+++ b/datasets/preprocess.py
@@ -11,7 +11,7 @@ def get_preprocessor_by_name(name):
 
 
 def tweb(root_path, meta_file):
-    """Normalize TWEB dataset. 
+    """Normalize TWEB dataset.
    https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset
     """
     txt_file = os.path.join(root_path, meta_file)
@@ -123,9 +123,9 @@ def nancy(root_path, meta_file):
     speaker_name = "nancy"
     with open(txt_file, 'r') as ttf:
         for line in ttf:
-            id = line.split()[1]
+            utt_id = line.split()[1]
             text = line[line.find('"') + 1:line.rfind('"') - 1]
-            wav_file = os.path.join(root_path, "wavn", id + ".wav")
+            wav_file = os.path.join(root_path, "wavn", utt_id + ".wav")
             items.append([text, wav_file, speaker_name])
     return items
diff --git a/debug_config.py b/debug_config.json
similarity index 100%
rename from debug_config.py
rename to debug_config.json
diff --git a/distribute.py b/distribute.py
index c2f786fe..22c27b1c 100644
--- a/distribute.py
+++ b/distribute.py
@@ -1,6 +1,5 @@
 # edited from https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/distributed.py
 import os
-import sys
 import math
 import time
 import subprocess
@@ -19,6 +18,7 @@ class DistributedSampler(Sampler):
     """
 
     def __init__(self, dataset, num_replicas=None, rank=None):
+        super(DistributedSampler, self).__init__(dataset)
         if num_replicas is None:
             if not dist.is_available():
                 raise RuntimeError("Requires distributed package to be available")
@@ -54,12 +54,6 @@ class DistributedSampler(Sampler):
         self.epoch = epoch
 
 
-def reduce_tensor(tensor, n_gpus):
-    rt = tensor.clone()
-    dist.all_reduce(rt, op=dist.reduce_op.SUM)
-    rt /= n_gpus
-    return rt
-
 def reduce_tensor(tensor, num_gpus):
     rt = tensor.clone()
     dist.all_reduce(rt, op=dist.reduce_op.SUM)
@@ -91,7 +85,7 @@ def apply_gradient_allreduce(module):
             dist.broadcast(p, 0)
 
     def allreduce_params():
-        if (module.needs_reduction):
+        if module.needs_reduction:
             module.needs_reduction = False
             # bucketing params based on value types
             buckets = {}
@@ -113,23 +107,39 @@ def apply_gradient_allreduce(module):
 
     for param in list(module.parameters()):
 
-        def allreduce_hook(*unused):
+        def allreduce_hook(*_):
             Variable._execution_engine.queue_callback(allreduce_params)
 
         if param.requires_grad:
             param.register_hook(allreduce_hook)
 
-    def set_needs_reduction(self, input, output):
+    def set_needs_reduction(self, *_):
        self.needs_reduction = True
 
     module.register_forward_hook(set_needs_reduction)
     return module
 
 
-def main(args):
+def main():
     """
     Call train.py as a new process and pass command arguments
     """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--restore_path',
+        type=str,
+        help='Folder path to checkpoints',
+        default='')
+    parser.add_argument(
+        '--config_path',
+        type=str,
+        help='path to config file for training',
+    )
+    parser.add_argument(
+        '--data_path', type=str, help='dataset path.', default='')
+
+    args = parser.parse_args()
+
     CONFIG = load_config(args.config_path)
     OUT_PATH = create_experiment_folder(CONFIG.output_path, CONFIG.run_name,
                                         True)
@@ -150,7 +160,7 @@ def main(args):
     if not os.path.isdir(stdout_path):
         os.makedirs(stdout_path)
         os.chmod(stdout_path, 0o775)
-    
+
     # run processes
     processes = []
     for i in range(num_gpus):
@@ -159,7 +169,7 @@ def main(args):
         command[6] = '--rank={}'.format(i)
         stdout = None if i == 0 else open(
             os.path.join(stdout_path, "process_{}.log".format(i)), "w")
-        p = subprocess.Popen(['python3'.format(i)] + command, stdout=stdout, env=my_env)
+        p = subprocess.Popen(['python3'] + command, stdout=stdout, env=my_env)
         processes.append(p)
         print(command)
@@ -168,19 +178,4 @@ def main(args):
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        '--restore_path',
-        type=str,
-        help='Folder path to checkpoints',
-        default='')
-    parser.add_argument(
-        '--config_path',
-        type=str,
-        help='path to config file for training',
-    )
-    parser.add_argument(
-        '--data_path', type=str, help='dataset path.', default='')
-
-    args = parser.parse_args()
-    main(args)
+    main()
diff --git a/layers/common_layers.py b/layers/common_layers.py
index c84b04b9..2edf0dab 100644
--- a/layers/common_layers.py
+++ b/layers/common_layers.py
@@ -1,7 +1,6 @@
-from math import sqrt
 import torch
-from torch.autograd import Variable
 from torch import nn
+from torch.autograd import Variable
 from torch.nn import functional as F
@@ -107,6 +106,8 @@ class LocationLayer(nn.Module):
 
 
 class Attention(nn.Module):
+    # Pylint gets confused by PyTorch conventions here
+    #pylint: disable=attribute-defined-outside-init
     def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
                  location_attention, attention_location_n_filters,
                  attention_location_kernel_size, windowing, norm, forward_attn,
@@ -262,4 +263,4 @@ class Attention(nn.Module):
         context = torch.bmm(alignment.unsqueeze(1), inputs)
         context = context.squeeze(1)
         self.attention_weights = alignment
-        return context
\ No newline at end of file
+        return context
diff --git a/layers/custom_layers.py b/layers/custom_layers.py
index e1fde912..72668c97 100644
--- a/layers/custom_layers.py
+++ b/layers/custom_layers.py
@@ -1,6 +1,6 @@
 # coding: utf-8
-import torch
-from torch import nn
+# import torch
+# from torch import nn
 
 # class StopProjection(nn.Module):
 #     r""" Simple projection layer to predict the "stop token"
diff --git a/layers/gst_layers.py b/layers/gst_layers.py
index 647712d7..8058d5ed 100644
--- a/layers/gst_layers.py
+++ b/layers/gst_layers.py
@@ -77,10 +77,11 @@ class ReferenceEncoder(nn.Module):
 
         return out.squeeze(0)
 
-    def calculate_post_conv_height(self, height, kernel_size, stride, pad,
+    @staticmethod
+    def calculate_post_conv_height(height, kernel_size, stride, pad,
                                    n_convs):
         """Height of spec after n convolutions with fixed kernel/stride/pad."""
-        for i in range(n_convs):
+        for _ in range(n_convs):
             height = (height - kernel_size + 2 * pad) // stride + 1
         return height
@@ -165,4 +166,4 @@ class MultiHeadAttention(nn.Module):
             torch.split(out, 1, dim=0),
             dim=3).squeeze(0)  # [N, T_q, num_units]
 
-        return out
\ No newline at end of file
+        return out
diff --git a/layers/losses.py b/layers/losses.py
index 0597bdf9..5a95c0fe 100644
--- a/layers/losses.py
+++ b/layers/losses.py
@@ -1,17 +1,13 @@
-import torch
-from torch.nn import functional
 from torch import nn
+from torch.nn import functional
 from utils.generic_utils import sequence_mask
 
 
 class L1LossMasked(nn.Module):
-    def __init__(self):
-        super(L1LossMasked, self).__init__()
-
-    def forward(self, input, target, length):
+    def forward(self, x, target, length):
         """
         Args:
-            input: A Variable containing a FloatTensor of size
+            x: A Variable containing a FloatTensor of size
                 (batch, max_len, dim)
                 which contains the unnormalized probability for each class.
             target: A Variable containing a LongTensor of size
@@ -26,21 +22,18 @@ class L1LossMasked(nn.Module):
         target.requires_grad = False
         mask = sequence_mask(
             sequence_length=length, max_len=target.size(1)).unsqueeze(2).float()
-        mask = mask.expand_as(input)
+        mask = mask.expand_as(x)
         loss = functional.l1_loss(
-            input * mask, target * mask, reduction="sum")
+            x * mask, target * mask, reduction="sum")
         loss = loss / mask.sum()
         return loss
 
 
 class MSELossMasked(nn.Module):
-    def __init__(self):
-        super(MSELossMasked, self).__init__()
-
-    def forward(self, input, target, length):
+    def forward(self, x, target, length):
         """
         Args:
-            input: A Variable containing a FloatTensor of size
+            x: A Variable containing a FloatTensor of size
                 (batch, max_len, dim)
                 which contains the unnormalized probability for each class.
             target: A Variable containing a LongTensor of size
@@ -55,9 +48,8 @@ class MSELossMasked(nn.Module):
         target.requires_grad = False
         mask = sequence_mask(
             sequence_length=length, max_len=target.size(1)).unsqueeze(2).float()
-        mask = mask.expand_as(input)
+        mask = mask.expand_as(x)
         loss = functional.mse_loss(
-            input * mask, target * mask, reduction="sum")
+            x * mask, target * mask, reduction="sum")
         loss = loss / mask.sum()
         return loss
-
diff --git a/layers/tacotron.py b/layers/tacotron.py
index 424f8479..b71ddbc3 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -177,7 +177,7 @@ class CBHG(nn.Module):
         # (B, in_features, T_in)
         if x.size(-1) == self.in_features:
             x = x.transpose(1, 2)
-        T = x.size(-1)
+        # T = x.size(-1)
         # (B, hid_features*K, T_in)
         # Concat conv1d bank outputs
         outs = []
@@ -261,7 +261,7 @@ class PostCBHG(nn.Module):
 
 
 class Decoder(nn.Module):
-    r"""Decoder module.
+    """Decoder module.
 
     Args:
         in_features (int): input vector (encoder output) sample size.
         memory_dim (int): memory vector (prev. time-step output) sample size.
         r (int): number of outputs per time step.
         memory_size (int): size of the past window. if <= 0 memory_size = r
         TODO: arguments
     """
+    # Pylint gets confused by PyTorch conventions here
+    #pylint: disable=attribute-defined-outside-init
     def __init__(self, in_features, memory_dim, r, memory_size,
                  attn_windowing, attn_norm, prenet_type, prenet_dropout,
                  forward_attn,
@@ -290,16 +292,16 @@ class Decoder(nn.Module):
         # processed_inputs, processed_memory -> |Attention| -> Attention, attention, RNN_State
         self.attention_rnn = nn.GRUCell(in_features + 128, 256)
         self.attention_layer = Attention(attention_rnn_dim=256,
-                                             embedding_dim=in_features,
-                                             attention_dim=128,
-                                             location_attention=location_attn,
-                                             attention_location_n_filters=32,
-                                             attention_location_kernel_size=31,
-                                             windowing=attn_windowing,
-                                             norm=attn_norm,
-                                             forward_attn=forward_attn,
-                                             trans_agent=trans_agent,
-                                             forward_attn_mask=forward_attn_mask)
+                                         embedding_dim=in_features,
+                                         attention_dim=128,
+                                         location_attention=location_attn,
+                                         attention_location_n_filters=32,
+                                         attention_location_kernel_size=31,
+                                         windowing=attn_windowing,
+                                         norm=attn_norm,
+                                         forward_attn=forward_attn,
+                                         trans_agent=trans_agent,
+                                         forward_attn_mask=forward_attn_mask)
         # (processed_memory | attention context) -> |Linear| -> decoder_RNN_input
         self.project_to_decoder_in = nn.Linear(256 + in_features, 256)
         # decoder_RNN_input -> |RNN| -> RNN_state
diff --git a/layers/tacotron2.py b/layers/tacotron2.py
index 09bf5373..802f158e 100644
--- a/layers/tacotron2.py
+++ b/layers/tacotron2.py
@@ -1,9 +1,8 @@
-from math import sqrt
 import torch
 from torch.autograd import Variable
 from torch import nn
 from torch.nn import functional as F
-from .common_layers import Attention, Prenet, Linear, LinearBN
+from .common_layers import Attention, Prenet, Linear
@@ -33,7 +32,7 @@ class Postnet(nn.Module):
         self.convolutions = nn.ModuleList()
         self.convolutions.append(
             ConvBNBlock(mel_dim, 512, kernel_size=5, nonlinear='tanh'))
-        for i in range(1, num_convs - 1):
+        for _ in range(1, num_convs - 1):
             self.convolutions.append(
                 ConvBNBlock(512, 512, kernel_size=5, nonlinear='tanh'))
         self.convolutions.append(
@@ -95,6 +94,8 @@ class Encoder(nn.Module):
 
 # adapted from https://github.com/NVIDIA/tacotron2/
 class Decoder(nn.Module):
+    # Pylint gets confused by PyTorch conventions here
+    #pylint: disable=attribute-defined-outside-init
     def __init__(self, in_features, inputs_dim, r, attn_win, attn_norm,
                  prenet_type, prenet_dropout, forward_attn, trans_agent,
                  forward_attn_mask, location_attn, separate_stopnet):
@@ -118,15 +119,15 @@ class Decoder(nn.Module):
         self.attention_rnn = nn.LSTMCell(self.prenet_dim + in_features,
                                          self.attention_rnn_dim)
 
-        self.attention_layer = Attention(attention_rnn_dim=self.attention_rnn_dim, 
+        self.attention_layer = Attention(attention_rnn_dim=self.attention_rnn_dim,
                                          embedding_dim=in_features,
-                                         attention_dim=128, 
-                                         location_attention=location_attn, 
+                                         attention_dim=128,
+                                         location_attention=location_attn,
                                          attention_location_n_filters=32,
                                          attention_location_kernel_size=31,
                                          windowing=attn_win,
-                                         norm=attn_norm, 
-                                         forward_attn=forward_attn, 
+                                         norm=attn_norm,
+                                         forward_attn=forward_attn,
                                          trans_agent=trans_agent,
                                          forward_attn_mask=forward_attn_mask)
@@ -156,7 +157,7 @@ class Decoder(nn.Module):
 
     def _init_states(self, inputs, mask, keep_states=False):
         B = inputs.size(0)
-        T = inputs.size(1)
+        # T = inputs.size(1)
 
         if not keep_states:
             self.attention_hidden = self.attention_rnn_init(
diff --git a/models/tacotron.py b/models/tacotron.py
index 2e59726b..bb8ebb7a 100644
--- a/models/tacotron.py
+++ b/models/tacotron.py
@@ -1,8 +1,6 @@
 # coding: 
utf-8
-import torch
 from torch import nn
-from math import sqrt
-from layers.tacotron import Prenet, Encoder, Decoder, PostCBHG
+from layers.tacotron import Encoder, Decoder, PostCBHG
 from utils.generic_utils import sequence_mask
diff --git a/models/tacotron2.py b/models/tacotron2.py
index e4f7abb0..526a3fac 100644
--- a/models/tacotron2.py
+++ b/models/tacotron2.py
@@ -1,8 +1,5 @@
 from math import sqrt
-import torch
-from torch.autograd import Variable
 from torch import nn
-from torch.nn import functional as F
 from layers.tacotron2 import Encoder, Decoder, Postnet
 from utils.generic_utils import sequence_mask
@@ -39,7 +36,8 @@ class Tacotron2(nn.Module):
                                location_attn, separate_stopnet)
         self.postnet = Postnet(self.n_mel_channels)
 
-    def shape_outputs(self, mel_outputs, mel_outputs_postnet, alignments):
+    @staticmethod
+    def shape_outputs(mel_outputs, mel_outputs_postnet, alignments):
         mel_outputs = mel_outputs.transpose(1, 2)
         mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
         return mel_outputs, mel_outputs_postnet, alignments
diff --git a/models/tacotrongst.py b/models/tacotrongst.py
index 14261d90..5b372338 100644
--- a/models/tacotrongst.py
+++ b/models/tacotrongst.py
@@ -1,8 +1,6 @@
 # coding: utf-8
-import torch
 from torch import nn
-from math import sqrt
-from layers.tacotron import Prenet, Encoder, Decoder, PostCBHG
+from layers.tacotron import Encoder, Decoder, PostCBHG
 from layers.gst_layers import GST
 from utils.generic_utils import sequence_mask
diff --git a/server/server.py b/server/server.py
index d7b1dca8..95fa1caf 100644
--- a/server/server.py
+++ b/server/server.py
@@ -2,7 +2,7 @@ import argparse
 
 from synthesizer import Synthesizer
 from utils.generic_utils import load_config
-from flask import Flask, Response, request, render_template, send_file
+from flask import Flask, request, render_template, send_file
 
 parser = argparse.ArgumentParser()
 parser.add_argument(
diff --git a/server/synthesizer.py b/server/synthesizer.py
index a7b78e76..29895b73 100644
--- a/server/synthesizer.py
+++ b/server/synthesizer.py
@@ -5,31 +5,23 @@ import numpy as np
 import torch
 import sys
 
-import numpy as np
-import torch
-
-from models.tacotron import Tacotron
 from utils.audio import AudioProcessor
 from utils.generic_utils import load_config, setup_model
 from utils.text import phoneme_to_sequence, phonemes, symbols, text_to_sequence, sequence_to_phoneme
 import re
 
-alphabets= "([A-Za-z])"
-prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
-suffixes = "(Inc|Ltd|Jr|Sr|Co)"
-starters = "(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
-acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
-websites = "[.](com|net|org|io|gov)"
+alphabets = r"([A-Za-z])"
+prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
+suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
+starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
+acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
+websites = r"[.](com|net|org|io|gov)"
 
-from models.tacotron import Tacotron
-from utils.audio import AudioProcessor
-from utils.generic_utils import load_config
-from utils.text import phoneme_to_sequence, phonemes, symbols, text_to_sequence
 
 class Synthesizer(object):
     def __init__(self, config):
         self.wavernn = None
-        self.config = config 
+        self.config = config
         self.use_cuda = config.use_cuda
         if self.use_cuda:
             assert torch.cuda.is_available(), "CUDA is not available on this machine."
@@ -52,7 +44,7 @@ class Synthesizer(object):
         else:
             self.input_size = len(symbols)
             self.input_adapter = lambda sen: text_to_sequence(sen, [self.tts_config.text_cleaner])
-        self.tts_model = setup_model(self.input_size, self.tts_config)
+        self.tts_model = setup_model(self.input_size, c=self.tts_config)  #FIXME: missing num_speakers argument to setup_model
         # load model state
         if use_cuda:
             cp = torch.load(self.model_file)
@@ -75,18 +67,18 @@ class Synthesizer(object):
             print(" | > model file: ", model_file)
             self.wavernn_config = load_config(wavernn_config)
             self.wavernn = Model(
-                    rnn_dims=512,
-                    fc_dims=512,
-                    mode=self.wavernn_config.mode,
-                    pad=2,
-                    upsample_factors=self.wavernn_config.upsample_factors,  # set this depending on dataset
-                    feat_dims=80,
-                    compute_dims=128,
-                    res_out_dims=128,
-                    res_blocks=10,
-                    hop_length=self.ap.hop_length,
-                    sample_rate=self.ap.sample_rate,
-                ).cuda()
+                rnn_dims=512,
+                fc_dims=512,
+                mode=self.wavernn_config.mode,
+                pad=2,
+                upsample_factors=self.wavernn_config.upsample_factors,  # set this depending on dataset
+                feat_dims=80,
+                compute_dims=128,
+                res_out_dims=128,
+                res_blocks=10,
+                hop_length=self.ap.hop_length,
+                sample_rate=self.ap.sample_rate,
+            ).cuda()
 
             check = torch.load(model_file)
             self.wavernn.load_state_dict(check['model'])
@@ -101,25 +93,30 @@ class Synthesizer(object):
 
     def split_into_sentences(self, text):
         text = " " + text + "  "
-        text = text.replace("\n"," ")
-        text = re.sub(prefixes,"\\1<prd>",text)
-        text = re.sub(websites,"<prd>\\1",text)
-        if "Ph.D" in text: text = text.replace("Ph.D.","Ph<prd>D<prd>")
-        text = re.sub("\s" + alphabets + "[.] "," \\1<prd> ",text)
-        text = re.sub(acronyms+" "+starters,"\\1<stop> \\2",text)
-        text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>\\3<prd>",text)
-        text = re.sub(alphabets + "[.]" + alphabets + "[.]","\\1<prd>\\2<prd>",text)
-        text = re.sub(" "+suffixes+"[.] "+starters," \\1<stop> \\2",text)
-        text = re.sub(" "+suffixes+"[.]"," \\1<prd>",text)
-        text = re.sub(" " + alphabets + "[.]"," \\1<prd>",text)
-        if "”" in text: text = text.replace(".”","”.")
-        if "\"" in text: text = text.replace(".\"","\".")
-        if "!" in text: text = text.replace("!\"","\"!")
-        if "?" in text: text = text.replace("?\"","\"?")
-        text = text.replace(".",".<stop>")
-        text = text.replace("?","?<stop>")
-        text = text.replace("!","!<stop>")
-        text = text.replace("<prd>",".")
+        text = text.replace("\n", " ")
+        text = re.sub(prefixes, "\\1<prd>", text)
+        text = re.sub(websites, "<prd>\\1", text)
+        if "Ph.D" in text:
+            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
+        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
+        text = re.sub(acronyms+" "+starters, "\\1<stop> \\2", text)
+        text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
+        text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
+        text = re.sub(" "+suffixes+"[.] "+starters, " \\1<stop> \\2", text)
+        text = re.sub(" "+suffixes+"[.]", " \\1<prd>", text)
+        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
+        if "”" in text:
+            text = text.replace(".”", "”.")
+        if "\"" in text:
+            text = text.replace(".\"", "\".")
+        if "!" in text:
+            text = text.replace("!\"", "\"!")
+        if "?" in text:
+            text = text.replace("?\"", "\"?")
+        text = text.replace(".", ".<stop>")
+        text = text.replace("?", "?<stop>")
+        text = text.replace("!", "!<stop>")
+        text = text.replace("<prd>", ".")
         sentences = text.split("<stop>")
         sentences = sentences[:-1]
         sentences = [s.strip() for s in sentences]
@@ -128,7 +125,7 @@ class Synthesizer(object):
     def tts(self, text):
         wavs = []
         sens = self.split_into_sentences(text)
-        if len(sens) == 0:
+        if not sens:
             sens = [text+'.']
         for sen in sens:
             if len(sen) < 3:
diff --git a/setup.py b/setup.py
index d55e0c59..b1c4c7ac 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,6 @@ import setuptools.command.develop
 import setuptools.command.build_py
 import os
 import subprocess
-from os.path import exists
 
 version = '0.0.1'
@@ -31,7 +30,6 @@ class build_py(setuptools.command.build_py.build_py):
 
     @staticmethod
     def create_version_file():
-        global version, cwd
         print('-- Building version ' + version)
         version_path = os.path.join(cwd, 'version.py')
         with open(version_path, 'w') as f:
@@ -45,7 +43,6 @@ class develop(setuptools.command.develop.develop):
 
 
 def create_readme_rst():
-    global cwd
     try:
         subprocess.check_call(
             [
diff --git a/test_cluster.py b/test_cluster.py
index 7f3cf221..daeeedc3 100644
--- a/test_cluster.py
+++ b/test_cluster.py
@@ -1 +1 @@
-print("Python is running!!")
\ No newline at end of file
+print("Python is running!!")
diff --git a/tests/generic_utils_text.py b/tests/generic_utils_text.py
index 56b794ce..2ef39c09 100644
--- a/tests/generic_utils_text.py
+++ b/tests/generic_utils_text.py
@@ -2,7 +2,7 @@ import unittest
 import torch as T
 
 from utils.generic_utils import save_checkpoint, save_best_model
-from layers.tacotron import Prenet, CBHG, Decoder, Encoder
+from layers.tacotron import Prenet
 
 OUT_PATH = '/tmp/test.pth.tar'
@@ -11,14 +11,14 @@ class ModelSavingTests(unittest.TestCase):
     def save_checkpoint_test(self):
         # create a dummy model
         model = Prenet(128, out_features=[256, 128])
-        model = T.nn.DataParallel(layer)
+        model = T.nn.DataParallel(layer)  #FIXME: undefined variable layer
 
         # save the model
-        save_checkpoint(model, None, 100, OUTPATH, 1, 1)
+        save_checkpoint(model, None, 100, OUT_PATH, 1, 1)
 
         # load the model to CPU
-        model_dict = torch.load(
-            MODEL_PATH, map_location=lambda storage, loc: storage)
+        model_dict = T.load(
+            MODEL_PATH, map_location=lambda storage, loc: storage)  #FIXME: undefined variable MODEL_PATH
         model.load_state_dict(model_dict['model'])
@@ -27,9 +27,9 @@ class ModelSavingTests(unittest.TestCase):
         model = T.nn.DataParallel(layer)
 
         # save the model
-        best_loss = save_best_model(model, None, 0, 100, OUT_PATH, 10, 1)
+        save_best_model(model, None, 0, 100, OUT_PATH, 10, 1)
 
         # load the model to CPU
-        model_dict = torch.load(
+        model_dict = T.load(
             MODEL_PATH, map_location=lambda storage, loc: storage)
         model.load_state_dict(model_dict['model'])
diff --git a/tests/test_audio.py b/tests/test_audio.py
index 4021a284..b2c4a135 100644
--- a/tests/test_audio.py
+++ b/tests/test_audio.py
@@ -1,7 +1,5 @@
 import os
 import unittest
-import numpy as np
-import torch as T
 
 from tests import get_tests_path, get_tests_input_path, get_tests_output_path
 from utils.audio import AudioProcessor
diff --git a/tests/test_layers.py b/tests/test_layers.py
index ec9f07af..5f84509a 100644
--- a/tests/test_layers.py
+++ b/tests/test_layers.py
@@ -19,6 +19,7 @@ class PrenetTests(unittest.TestCase):
 
 class CBHGTests(unittest.TestCase):
     def test_in_out(self):
+        #pylint: disable=attribute-defined-outside-init
         layer = self.cbhg = CBHG(
             128,
             K=8,
@@ -38,7 +39,7 @@ 
class CBHGTests(unittest.TestCase):
 
 class DecoderTests(unittest.TestCase):
     def test_in_out(self):
-        layer = Decoder(in_features=256, memory_dim=80, r=2, memory_size=4, attn_windowing=False, attn_norm="sigmoid")
+        layer = Decoder(in_features=256, memory_dim=80, r=2, memory_size=4, attn_windowing=False, attn_norm="sigmoid")  #FIXME: several missing required parameters for Decoder ctor
         dummy_input = T.rand(4, 8, 256)
         dummy_memory = T.rand(4, 2, 80)
diff --git a/tests/test_loader.py b/tests/test_loader.py
index 0830cdc9..682f5161 100644
--- a/tests/test_loader.py
+++ b/tests/test_loader.py
@@ -1,7 +1,6 @@
 import os
 import unittest
 import shutil
-import numpy as np
 
 from torch.utils.data import DataLoader
 from utils.generic_utils import load_config
@@ -132,7 +131,7 @@ class TestTTSDataset(unittest.TestCase):
                 self.ap.save_wav(wav, OUTPATH + '/mel_inv_dataloader.wav')
                 shutil.copy(item_idx[0], OUTPATH + '/mel_target_dataloader.wav')
 
-                # check linear-spec 
+                # check linear-spec
                 linear_spec = linear_input[0].cpu().numpy()
                 wav = self.ap.inv_spectrogram(linear_spec.T)
                 self.ap.save_wav(wav, OUTPATH + '/linear_inv_dataloader.wav')
diff --git a/tests/test_tacotron2_model.py b/tests/test_tacotron2_model.py
index c2f212f9..c3020209 100644
--- a/tests/test_tacotron2_model.py
+++ b/tests/test_tacotron2_model.py
@@ -37,7 +37,7 @@ class TacotronTrainTest(unittest.TestCase):
         criterion = MSELossMasked().to(device)
         criterion_st = nn.BCEWithLogitsLoss().to(device)
 
-        model = Tacotron2(24, c.r).to(device)
+        model = Tacotron2(24, c.r).to(device)  #FIXME: missing num_speakers parameter to Tacotron2 ctor
         model.train()
         model_ref = copy.deepcopy(model)
         count = 0
diff --git a/tests/test_tacotron_model.py b/tests/test_tacotron_model.py
index e0580107..001e335a 100644
--- a/tests/test_tacotron_model.py
+++ b/tests/test_tacotron_model.py
@@ -2,7 +2,6 @@ import os
 import copy
 import torch
 import unittest
-import numpy as np
 
 from torch import optim
 from torch import nn
@@ -48,7 +47,7 @@ class TacotronTrainTest(unittest.TestCase):
             linear_dim=c.audio['num_freq'],
             mel_dim=c.audio['num_mels'],
             r=c.r,
-            memory_size=c.memory_size).to(device)
+            memory_size=c.memory_size).to(device)  #FIXME: missing num_speakers parameter to Tacotron ctor
         model.train()
         print(" > Num parameters for Tacotron model:%s"%(count_parameters(model)))
         model_ref = copy.deepcopy(model)
@@ -58,7 +57,7 @@ class TacotronTrainTest(unittest.TestCase):
             assert (param - param_ref).sum() == 0, param
             count += 1
         optimizer = optim.Adam(model.parameters(), lr=c.lr)
-        for i in range(5):
+        for _ in range(5):
             mel_out, linear_out, align, stop_tokens = model.forward(
                 input, input_lengths, mel_spec)
             optimizer.zero_grad()
@@ -77,4 +76,4 @@ class TacotronTrainTest(unittest.TestCase):
             assert (param != param_ref).any(
             ), "param {} with shape {} not updated!! \n{}\n{}".format(
                 count, param.shape, param, param_ref)
-            count += 1
\ No newline at end of file
+            count += 1
diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py
index 6dedd943..62440e47 100644
--- a/tests/test_text_processing.py
+++ b/tests/test_text_processing.py
@@ -69,7 +69,6 @@ def test_phoneme_to_sequence():
 
 def test_text2phone():
     text = "Recent research at Harvard has shown meditating for as little as 8 weeks can actually increase, the grey matter in the parts of the brain responsible for emotional regulation and learning!"
-    text_cleaner = ["phoneme_cleaners"]
     gt = "ɹ|iː|s|ə|n|t| |ɹ|ɪ|s|ɜː|tʃ| |æ|t| |h|ɑːɹ|v|ɚ|d| |h|ɐ|z| |ʃ|oʊ|n| |m|ɛ|d|ᵻ|t|eɪ|ɾ|ɪ|ŋ| |f|ɔː|ɹ| |æ|z| |l|ɪ|ɾ|əl| |æ|z| |eɪ|t| |w|iː|k|s| |k|æ|n| |æ|k|tʃ|uː|əl|i|| |ɪ|n|k|ɹ|iː|s|,| |ð|ə| |ɡ|ɹ|eɪ| |m|æ|ɾ|ɚ|ɹ| |ɪ|n|ð|ə| |p|ɑːɹ|t|s| |ʌ|v|ð|ə| |b|ɹ|eɪ|n| |ɹ|ɪ|s|p|ɑː|n|s|ə|b|əl| |f|ɔː|ɹ| |ɪ|m|oʊ|ʃ|ə|n|əl| |ɹ|ɛ|ɡ|j|uː|l|eɪ|ʃ|ə|n||| |æ|n|d| |l|ɜː|n|ɪ|ŋ|!"
     lang = "en-us"
     phonemes = text2phone(text, lang)
diff --git a/train.py b/train.py
index cb792700..6252c993 100644
--- a/train.py
+++ b/train.py
@@ -7,7 +7,6 @@ import traceback
 import numpy as np
 import torch
 import torch.nn as nn
-from tensorboardX import SummaryWriter
 from torch import optim
 from torch.utils.data import DataLoader
@@ -18,9 +17,8 @@ from layers.losses import L1LossMasked, MSELossMasked
 from utils.audio import AudioProcessor
 from utils.generic_utils import (NoamLR, check_update, count_parameters,
                                  create_experiment_folder, get_git_branch,
-                                 load_config, lr_decay,
-                                 remove_experiment_folder, save_best_model,
-                                 save_checkpoint, sequence_mask, weight_decay,
+                                 load_config, remove_experiment_folder,
+                                 save_best_model, save_checkpoint, weight_decay,
                                  set_init_dict, copy_config_file, setup_model,
                                  split_dataset)
 from utils.logger import Logger
@@ -87,7 +85,7 @@ def setup_loader(is_val=False, verbose=False):
 
 def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
           ap, epoch):
-    data_loader = setup_loader(is_val=False, verbose=(epoch==0))
+    data_loader = setup_loader(is_val=False, verbose=(epoch == 0))
     if c.use_speaker_embedding:
         speaker_mapping = load_speaker_mapping(OUT_PATH)
     model.train()
@@ -131,7 +129,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
         if c.lr_decay:
             scheduler.step()
         optimizer.zero_grad()
-        if optimizer_st: optimizer_st.zero_grad();
+        if optimizer_st:
+            optimizer_st.zero_grad()
 
         # dispatch data to GPU
         if use_cuda:
@@ -146,7 +145,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
 
         # forward pass model
         decoder_output, postnet_output, alignments, stop_tokens = model(
-            text_input, text_lengths, mel_input, speaker_ids=speaker_ids) 
+            text_input, text_lengths, mel_input, speaker_ids=speaker_ids)
 
         # loss computation
         stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)
@@ -203,16 +202,16 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
         if args.rank == 0:
             avg_postnet_loss += float(postnet_loss.item())
             avg_decoder_loss += float(decoder_loss.item())
-            avg_stop_loss += stop_loss if type(stop_loss) is float else float(stop_loss.item())
+            avg_stop_loss += stop_loss if isinstance(stop_loss, float) else float(stop_loss.item())
             avg_step_time += step_time
 
             # Plot Training Iter Stats
             iter_stats = {"loss_posnet": postnet_loss.item(),
-                        "loss_decoder": decoder_loss.item(),
-                        "lr": current_lr,
-                        "grad_norm": grad_norm,
-                        "grad_norm_st": grad_norm_st,
-                        "step_time": step_time}
+                          "loss_decoder": decoder_loss.item(),
+                          "lr": current_lr,
+                          "grad_norm": grad_norm,
+                          "grad_norm_st": grad_norm_st,
+                          "step_time": step_time}
             tb_logger.tb_train_iter_stats(current_step, iter_stats)
 
             if current_step % c.save_step == 0:
@@ -224,7 +223,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
 
                 # Diagnostic visualizations
                 const_spec = postnet_output[0].data.cpu().numpy()
-                gt_spec = linear_input[0].data.cpu().numpy() if c.model in ["Tacotron", "TacotronGST"] else mel_input[0].data.cpu().numpy() 
+                gt_spec = linear_input[0].data.cpu().numpy() if 
c.model in ["Tacotron", "TacotronGST"] else mel_input[0].data.cpu().numpy()
                 align_img = alignments[0].data.cpu().numpy()
 
                 figures = {
@@ -239,9 +238,9 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
                     train_audio = ap.inv_spectrogram(const_spec.T)
                 else:
                     train_audio = ap.inv_mel_spectrogram(const_spec.T)
-                tb_logger.tb_train_audios(current_step, 
-                                          {'TrainAudio': train_audio}, 
-                                          c.audio["sample_rate"])
+                tb_logger.tb_train_audios(current_step,
+                                          {'TrainAudio': train_audio},
+                                          c.audio["sample_rate"])
 
     avg_postnet_loss /= (num_iter + 1)
     avg_decoder_loss /= (num_iter + 1)
@@ -263,9 +262,9 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
     if args.rank == 0:
         # Plot Training Epoch Stats
         epoch_stats = {"loss_postnet": avg_postnet_loss,
-                    "loss_decoder": avg_decoder_loss,
-                    "stop_loss": avg_stop_loss,
-                    "epoch_time": epoch_time}
+                       "loss_decoder": avg_decoder_loss,
+                       "stop_loss": avg_stop_loss,
+                       "epoch_time": epoch_time}
         tb_logger.tb_train_epoch_stats(current_step, epoch_stats)
         if c.tb_model_param_stats:
             tb_logger.tb_model_weights(model, current_step)
@@ -402,8 +401,8 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
 
         # Plot Validation Stats
         epoch_stats = {"loss_postnet": avg_postnet_loss,
-                    "loss_decoder": avg_decoder_loss,
-                    "stop_loss": avg_stop_loss}
+                       "loss_decoder": avg_decoder_loss,
+                       "stop_loss": avg_stop_loss}
         tb_logger.tb_eval_stats(current_step, epoch_stats)
 
     if args.rank == 0 and epoch > c.test_delay_epochs:
@@ -420,7 +419,7 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
                 file_path = os.path.join(AUDIO_PATH, str(current_step))
                 os.makedirs(file_path, exist_ok=True)
                 file_path = os.path.join(file_path,
-                                        "TestSentence_{}.wav".format(idx))
+                                         "TestSentence_{}.wav".format(idx))
                 ap.save_wav(wav, file_path)
                 test_audios['{}-audio'.format(idx)] = wav
                 test_figures['{}-prediction'.format(idx)] = plot_spectrogram(postnet_output, ap)
@@ -482,12 +481,11 @@ def main(args):
             # TODO: fix optimizer init, model.cuda() needs to be called before
             # optimizer restore
             # optimizer.load_state_dict(checkpoint['optimizer'])
-            if len(c.reinit_layers) > 0:
+            if c.reinit_layers:
                 raise RuntimeError
             model.load_state_dict(checkpoint['model'])
         except:
            print(" > Partial model initialization.")
-            partial_init_flag = True
            model_dict = model.state_dict()
            model_dict = set_init_dict(model_dict, checkpoint, c)
            model.load_state_dict(model_dict)
@@ -496,7 +494,6 @@ def main(args):
                 group['lr'] = c.lr
         print(
             " > Model restored from step %d" % checkpoint['step'], flush=True)
-        start_epoch = checkpoint['epoch']
         args.restore_step = checkpoint['step']
     else:
         args.restore_step = 0
@@ -504,7 +501,8 @@ def main(args):
     if use_cuda:
         model = model.cuda()
         criterion.cuda()
-        if criterion_st: criterion_st.cuda();
+        if criterion_st:
+            criterion_st.cuda()
 
     # DISTRIBUTED
     if num_gpus > 1:
@@ -615,7 +613,7 @@ if __name__ == '__main__':
         os.chmod(AUDIO_PATH, 0o775)
         os.chmod(OUT_PATH, 0o775)
 
-    if args.rank==0:
+    if args.rank == 0:
         LOG_DIR = OUT_PATH
         tb_logger = Logger(LOG_DIR)
@@ -629,8 +627,8 @@ if __name__ == '__main__':
         try:
             sys.exit(0)
         except SystemExit:
-            os._exit(0)
+            os._exit(0)  #pylint: disable=protected-access
-    except Exception:
+    except Exception:  #pylint: disable=broad-except
         remove_experiment_folder(OUT_PATH)
         traceback.print_exc()
         sys.exit(1)
diff --git a/utils/audio.py b/utils/audio.py
index 862b0446..657b5275 100644
--- a/utils/audio.py
+++ b/utils/audio.py
@@ -1,11 +1,8 @@
-import os
 import librosa
 import soundfile as sf
-import pickle
-import copy
 import numpy as np
-from pprint import pprint
-from scipy import signal, io
+import scipy.io
+import scipy.signal
 
 
 class AudioProcessor(object):
@@ -27,7 +24,7 @@ class AudioProcessor(object):
                  clip_norm=True,
                  griffin_lim_iters=None,
                  do_trim_silence=False,
-                 **kwargs):
+                 **_):
 
         print(" > Setting up Audio Processor...")
@@ -55,7 +52,7 @@ class AudioProcessor(object):
 
     def save_wav(self, wav, path):
         wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
-        io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16))
+        scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16))
 
     def _linear_to_mel(self, spectrogram):
         _mel_basis = self._build_mel_basis()
@@ -78,11 +75,12 @@ class AudioProcessor(object):
 
     def _normalize(self, S):
         """Put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]"""
+        #pylint: disable=no-else-return
         if self.signal_norm:
             S_norm = ((S - self.min_level_db) / - self.min_level_db)
             if self.symmetric_norm:
                 S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
-                if self.clip_norm :
+                if self.clip_norm:
                     S_norm = np.clip(S_norm, -self.max_norm, self.max_norm)
                 return S_norm
             else:
@@ -95,18 +93,19 @@ class AudioProcessor(object):
 
     def _denormalize(self, S):
         """denormalize values"""
+        #pylint: disable=no-else-return
         S_denorm = S
         if self.signal_norm:
             if self.symmetric_norm:
                 if self.clip_norm:
-                    S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm) 
+                    S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
                 S_denorm = ((S_denorm + self.max_norm) * -self.min_level_db / (2 * self.max_norm)) + self.min_level_db
                 return S_denorm
             else:
                 if self.clip_norm:
                     S_denorm = np.clip(S_denorm, 0, self.max_norm)
                 S_denorm = (S_denorm * -self.min_level_db /
-                           self.max_norm) + self.min_level_db
+                            self.max_norm) + self.min_level_db
                 return S_denorm
         else:
             return S
@@ -122,18 +121,19 @@ class AudioProcessor(object):
         min_level = np.exp(self.min_level_db / 20 * np.log(10))
         return 20 * np.log10(np.maximum(min_level, x))
 
-    def _db_to_amp(self, x):
+    @staticmethod
+    def _db_to_amp(x):
         return np.power(10.0, x * 0.05)
 
     def apply_preemphasis(self, x):
         if self.preemphasis == 0:
             raise RuntimeError(" !! Preemphasis is applied with factor 0.0. ")
-        return signal.lfilter([1, -self.preemphasis], [1], x)
+        return scipy.signal.lfilter([1, -self.preemphasis], [1], x)
 
     def apply_inv_preemphasis(self, x):
         if self.preemphasis == 0:
             raise RuntimeError(" !! Preemphasis is applied with factor 0.0. 
") - return signal.lfilter([1], [1, -self.preemphasis], x) + return scipy.signal.lfilter([1], [1, -self.preemphasis], x) def spectrogram(self, y): if self.preemphasis != 0: @@ -158,8 +158,7 @@ class AudioProcessor(object): # Reconstruct phase if self.preemphasis != 0: return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) - else: - return self._griffin_lim(S**self.power) + return self._griffin_lim(S**self.power) def inv_mel_spectrogram(self, mel_spectrogram): '''Converts mel spectrogram to waveform using librosa''' @@ -168,12 +167,11 @@ class AudioProcessor(object): S = self._mel_to_linear(S) # Convert back to linear if self.preemphasis != 0: return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) - else: - return self._griffin_lim(S**self.power) + return self._griffin_lim(S**self.power) def out_linear_to_mel(self, linear_spec): S = self._denormalize(linear_spec) - S = self._db_to_amp(S + self.ref_level_db) + S = self._db_to_amp(S + self.ref_level_db) S = self._linear_to_mel(np.abs(S)) S = self._amp_to_db(S) - self.ref_level_db mel = self._normalize(S) @@ -183,7 +181,7 @@ class AudioProcessor(object): angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) S_complex = np.abs(S).astype(np.complex) y = self._istft(S_complex * angles) - for i in range(self.griffin_lim_iters): + for _ in range(self.griffin_lim_iters): angles = np.exp(1j * np.angle(self._stft(y))) y = self._istft(S_complex * angles) return y @@ -240,16 +238,19 @@ class AudioProcessor(object): if self.do_trim_silence: try: x = self.trim_silence(x) - except ValueError as e: + except ValueError: print(f' [!] File cannot be trimmed for silence - {filename}') assert self.sample_rate == sr, "%s vs %s"%(self.sample_rate, sr) return x - def encode_16bits(self, x): + @staticmethod + def encode_16bits(x): return np.clip(x * 2**15, -2**15, 2**15 - 1).astype(np.int16) - def quantize(self, x, bits): + @staticmethod + def quantize(x, bits): return (x + 1.) * (2**bits - 1) / 2 - def dequantize(self, x, bits): + @staticmethod + def dequantize(x, bits): return 2 * x / (2**bits - 1) - 1 diff --git a/utils/data.py b/utils/data.py index f7f1d0ee..bbb4a31a 100644 --- a/utils/data.py +++ b/utils/data.py @@ -45,7 +45,6 @@ def prepare_stop_target(inputs, out_steps): def pad_per_step(inputs, pad_len): - timesteps = inputs.shape[-1] return np.pad( inputs, [[0, 0], [0, 0], [0, pad_len]], mode='constant', diff --git a/utils/generic_utils.py b/utils/generic_utils.py index fe7e0623..64414765 100644 --- a/utils/generic_utils.py +++ b/utils/generic_utils.py @@ -1,8 +1,6 @@ import os import re -import sys import glob -import time import shutil import datetime import json @@ -11,8 +9,6 @@ import subprocess import importlib import numpy as np from collections import OrderedDict, Counter -from torch.autograd import Variable -from utils.text import text_to_sequence class AttrDict(dict): @@ -78,7 +74,7 @@ def remove_experiment_folder(experiment_path): """Check folder if there is a checkpoint, otherwise remove the folder""" checkpoint_files = glob.glob(experiment_path + "/*.pth.tar") - if len(checkpoint_files) < 1: + if not checkpoint_files: if os.path.exists(experiment_path): shutil.rmtree(experiment_path) print(" ! 
Run is removed from {}".format(experiment_path)) @@ -87,7 +83,6 @@ def remove_experiment_folder(experiment_path): def copy_config_file(config_file, out_path, new_fields): - config_name = os.path.basename(config_file) config_lines = open(config_file, "r").readlines() # add extra information fields for key, value in new_fields.items(): diff --git a/utils/logger.py b/utils/logger.py index 2b1e262d..be523d36 100644 --- a/utils/logger.py +++ b/utils/logger.py @@ -46,7 +46,7 @@ class Logger(object): def tb_train_iter_stats(self, step, stats): self.dict_to_tb_scalar("TrainIterStats", stats, step) - + def tb_train_epoch_stats(self, step, stats): self.dict_to_tb_scalar("TrainEpochStats", stats, step) @@ -64,12 +64,9 @@ class Logger(object): def tb_eval_audios(self, step, audios, sample_rate): self.dict_to_tb_audios("EvalAudios", audios, step, sample_rate) - + def tb_test_audios(self, step, audios, sample_rate): self.dict_to_tb_audios("TestAudios", audios, step, sample_rate) def tb_test_figures(self, step, figures): self.dict_to_tb_figure("TestFigures", figures, step) - - - \ No newline at end of file diff --git a/utils/synthesis.py b/utils/synthesis.py index 541ee717..a6677bbc 100644 --- a/utils/synthesis.py +++ b/utils/synthesis.py @@ -1,11 +1,6 @@ -import io -import time -import librosa import torch import numpy as np -from .text import text_to_sequence, phoneme_to_sequence, sequence_to_phoneme -from .visual import visualize -from matplotlib import pylab as plt +from .text import text_to_sequence, phoneme_to_sequence def text_to_seqvec(text, CONFIG, use_cuda): @@ -31,8 +26,7 @@ def compute_style_mel(style_wav, ap, use_cuda): ap.load_wav(style_wav))).unsqueeze(0) if use_cuda: return style_mel.cuda() - else: - return style_mel + return style_mel def run_model(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None): @@ -84,7 +78,7 @@ def synthesis(model, style_wav=None, truncated=False, enable_eos_bos_chars=False, - trim_silence=False): + do_trim_silence=False): """Synthesize voice for the given text. Args: @@ -99,7 +93,7 @@ def synthesis(model, truncated (bool): keep model states after inference. It can be used for continuous inference at long texts. enable_eos_bos_chars (bool): enable special chars for end of sentence and start of sentence. - trim_silence (bool): trim silence after synthesis. + do_trim_silence (bool): trim silence after synthesis. 
""" # GST processing style_mel = None @@ -119,6 +113,6 @@ def synthesis(model, # plot results wav = inv_spectrogram(postnet_output, ap, CONFIG) # trim silence - if trim_silence: + if do_trim_silence: wav = trim_silence(wav) return wav, alignment, decoder_output, postnet_output, stop_tokens diff --git a/utils/text/__init__.py b/utils/text/__init__.py index 24433cf3..5431e46e 100644 --- a/utils/text/__init__.py +++ b/utils/text/__init__.py @@ -7,17 +7,17 @@ from utils.text import cleaners from utils.text.symbols import symbols, phonemes, _phoneme_punctuations # Mappings from symbol to numeric ID and vice versa: -_symbol_to_id = {s: i for i, s in enumerate(symbols)} -_id_to_symbol = {i: s for i, s in enumerate(symbols)} +_SYMBOL_TO_ID = {s: i for i, s in enumerate(symbols)} +_ID_TO_SYMBOL = {i: s for i, s in enumerate(symbols)} -_phonemes_to_id = {s: i for i, s in enumerate(phonemes)} -_id_to_phonemes = {i: s for i, s in enumerate(phonemes)} +_PHONEMES_TO_ID = {s: i for i, s in enumerate(phonemes)} +_ID_TO_PHONEMES = {i: s for i, s in enumerate(phonemes)} # Regular expression matching text enclosed in curly braces: -_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') +_CURLY_RE = re.compile(r'(.*?)\{(.+?)\}(.*)') # Regular expression matchinf punctuations, ignoring empty space -pat = r'['+_phoneme_punctuations+']+' +PHONEME_PUNCTUATION_PATTERN = r'['+_phoneme_punctuations+']+' def text2phone(text, language): @@ -26,11 +26,11 @@ def text2phone(text, language): ''' seperator = phonemizer.separator.Separator(' |', '', '|') #try: - punctuations = re.findall(pat, text) + punctuations = re.findall(PHONEME_PUNCTUATION_PATTERN, text) ph = phonemize(text, separator=seperator, strip=False, njobs=1, backend='espeak', language=language) ph = ph[:-1].strip() # skip the last empty character # Replace \n with matching punctuations. - if len(punctuations) > 0: + if punctuations: # if text ends with a punctuation. if text[-1] == punctuations[-1]: for punct in punctuations[:-1]: @@ -47,20 +47,20 @@ def text2phone(text, language): def phoneme_to_sequence(text, cleaner_names, language, enable_eos_bos=False): if enable_eos_bos: - sequence = [_phonemes_to_id['^']] + sequence = [_PHONEMES_TO_ID['^']] else: sequence = [] text = text.replace(":", "") clean_text = _clean_text(text, cleaner_names) - phonemes = text2phone(clean_text, language) - if phonemes is None: + to_phonemes = text2phone(clean_text, language) + if to_phonemes is None: print("!! After phoneme conversion the result is None. -- {} ".format(clean_text)) # iterate by skipping empty strings - NOTE: might be useful to keep it to have a better intonation. 
-    for phoneme in filter(None, phonemes.split('|')):
+    for phoneme in filter(None, to_phonemes.split('|')):
         sequence += _phoneme_to_sequence(phoneme)
     # Append EOS char
     if enable_eos_bos:
-        sequence.append(_phonemes_to_id['~'])
+        sequence.append(_PHONEMES_TO_ID['~'])
     return sequence
@@ -68,8 +68,8 @@ def sequence_to_phoneme(sequence):
     '''Converts a sequence of IDs back to a string'''
     result = ''
     for symbol_id in sequence:
-        if symbol_id in _id_to_phonemes:
-            s = _id_to_phonemes[symbol_id]
+        if symbol_id in _ID_TO_PHONEMES:
+            s = _ID_TO_PHONEMES[symbol_id]
             result += s
     return result.replace('}{', ' ')
@@ -89,8 +89,8 @@ def text_to_sequence(text, cleaner_names):
     '''
     sequence = []
     # Check for curly braces and treat their contents as ARPAbet:
-    while len(text):
-        m = _curly_re.match(text)
+    while text:
+        m = _CURLY_RE.match(text)
         if not m:
             sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
             break
@@ -105,8 +105,8 @@ def sequence_to_text(sequence):
     '''Converts a sequence of IDs back to a string'''
     result = ''
     for symbol_id in sequence:
-        if symbol_id in _id_to_symbol:
-            s = _id_to_symbol[symbol_id]
+        if symbol_id in _ID_TO_SYMBOL:
+            s = _ID_TO_SYMBOL[symbol_id]
             # Enclose ARPAbet back in curly braces:
             if len(s) > 1 and s[0] == '@':
                 s = '{%s}' % s[1:]
@@ -123,12 +123,12 @@ def _clean_text(text, cleaner_names):
     return text
 
 
-def _symbols_to_sequence(symbols):
-    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
+def _symbols_to_sequence(syms):
+    return [_SYMBOL_TO_ID[s] for s in syms if _should_keep_symbol(s)]
 
 
-def _phoneme_to_sequence(phonemes):
-    return [_phonemes_to_id[s] for s in list(phonemes) if _should_keep_phoneme(s)]
+def _phoneme_to_sequence(phons):
+    return [_PHONEMES_TO_ID[s] for s in list(phons) if _should_keep_phoneme(s)]
 
 
 def _arpabet_to_sequence(text):
@@ -136,8 +136,8 @@ def _arpabet_to_sequence(text):
 
 
 def _should_keep_symbol(s):
-    return s in _symbol_to_id and s not in ['~', '^', '_']
+    return s in _SYMBOL_TO_ID and s not in ['~', '^', '_']
 
 
 def _should_keep_phoneme(p):
-    return p in _phonemes_to_id and p not in ['~', '^', '_']
+    return p in _PHONEMES_TO_ID and p not in ['~', '^', '_']
diff --git a/utils/text/cmudict.py b/utils/text/cmudict.py
index 1202bf3d..c0f23406 100644
--- a/utils/text/cmudict.py
+++ b/utils/text/cmudict.py
@@ -2,16 +2,16 @@
 
 import re
 
-# valid_symbols = [
-#     'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
-#     'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
-#     'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
-#     'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
-#     'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
-#     'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
-#     'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
-#     'Y', 'Z', 'ZH'
-# ]
+VALID_SYMBOLS = [
+    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
+    'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
+    'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
+    'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
+    'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
+    'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
+    'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
+    'Y', 'Z', 'ZH'
+]
 
 
 class CMUDict:
@@ -37,19 +37,19 @@ class CMUDict:
         '''Returns list of ARPAbet pronunciations of the given word.'''
         return self._entries.get(word.upper())
 
-    def get_arpabet(self, word, cmudict, punctuation_symbols):
+    @staticmethod
+    def get_arpabet(word, cmudict, punctuation_symbols):
         first_symbol, last_symbol = '', ''
-        if len(word) > 0 and word[0] in punctuation_symbols:
+        if word and word[0] in punctuation_symbols:
             first_symbol = word[0]
             word = word[1:]
-        if len(word) > 0 and word[-1] in punctuation_symbols:
+        if word and word[-1] in punctuation_symbols:
             last_symbol = word[-1]
             word = word[:-1]
         arpabet = cmudict.lookup(word)
         if arpabet is not None:
             return first_symbol + '{%s}' % arpabet[0] + last_symbol
-        else:
-            return first_symbol + word + last_symbol
+        return first_symbol + word + last_symbol
 
 
 _alt_re = re.compile(r'\([0-9]+\)')
@@ -58,7 +58,7 @@ _alt_re = re.compile(r'\([0-9]+\)')
 def _parse_cmudict(file):
     cmudict = {}
     for line in file:
-        if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
+        if line and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"):
             parts = line.split('  ')
             word = re.sub(_alt_re, '', parts[0])
             pronunciation = _get_pronunciation(parts[1])
@@ -73,6 +73,6 @@ def _parse_cmudict(file):
 def _get_pronunciation(s):
     parts = s.strip().split(' ')
     for part in parts:
-        if part not in _valid_symbol_set:
+        if part not in VALID_SYMBOLS:
             return None
     return ' '.join(parts)
diff --git a/utils/text/number_norm.py b/utils/text/number_norm.py
index 9cc6f4df..d3d9a46b 100644
--- a/utils/text/number_norm.py
+++ b/utils/text/number_norm.py
@@ -66,14 +66,13 @@ def _expand_dollars(m):
         dollar_unit = 'dollar' if dollars == 1 else 'dollars'
         cent_unit = 'cent' if cents == 1 else 'cents'
         return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
-    elif dollars:
+    if dollars:
         dollar_unit = 'dollar' if dollars == 1 else 'dollars'
         return '%s %s' % (dollars, dollar_unit)
-    elif cents:
+    if cents:
         cent_unit = 'cent' if cents == 1 else 'cents'
         return '%s %s' % (cents, cent_unit)
-    else:
-        return 'zero dollars'
+    return 'zero dollars'
 
 
 def _standard_number_to_words(n, digit_group):
@@ -99,12 +98,11 @@ def _number_to_words(n):
     # Handle special cases first, then go to the standard case:
     if n >= 1000000000000000000:
         return str(n)  # Too large, just return the digits
-    elif n == 0:
+    if n == 0:
         return 'zero'
-    elif n % 100 == 0 and n % 1000 != 0 and n < 3000:
+    if n % 100 == 0 and n % 1000 != 0 and n < 3000:
         return _standard_number_to_words(n // 100, 0) + ' hundred'
-    else:
-        return _standard_number_to_words(n, 0)
+    return _standard_number_to_words(n, 0)
 
 
 def _expand_number(m):
diff --git a/utils/visual.py b/utils/visual.py
index 9fd7a790..982fa53a 100644
--- a/utils/visual.py
+++ b/utils/visual.py
@@ -1,4 +1,3 @@
-import numpy as np
 import librosa
 import matplotlib
 matplotlib.use('Agg')
@@ -49,7 +48,7 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON
         print(text)
     plt.yticks(range(len(text)), list(text))
     plt.colorbar()
-    
+
     stop_tokens = stop_tokens.squeeze().detach().to('cpu').numpy()
     plt.subplot(num_plot, 1, 2)
     plt.plot(range(len(stop_tokens)), list(stop_tokens))
@@ -65,12 +64,12 @@ def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length, CON
     if spectrogram is not None:
         plt.subplot(num_plot, 1, 4)
         librosa.display.specshow(spectrogram.T, sr=CONFIG.audio['sample_rate'],
-                                hop_length=hop_length, x_axis="time", y_axis="linear")
+                                 hop_length=hop_length, x_axis="time", y_axis="linear")
         plt.xlabel("Time", fontsize=label_fontsize)
         plt.ylabel("Hz", fontsize=label_fontsize)
 
     plt.tight_layout()
     plt.colorbar()
-    
+
     if output_path:
         print(output_path)
         fig.savefig(output_path)