From 062e8a0880e895816a66c775aff424c053c9be63 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Thu, 13 Dec 2018 18:18:37 +0100
Subject: [PATCH 01/17] logger for tensorboard plotting

---
 config.json      |   1 +
 requirements.txt |   3 +-
 train.py         | 122 ++++++++++++++++-------------------
 utils/logger.py  |  75 +++++++++++++++++++++++++++++
 4 files changed, 118 insertions(+), 83 deletions(-)
 create mode 100644 utils/logger.py

diff --git a/config.json b/config.json
index e67d9a4f..bd49f6e8 100644
--- a/config.json
+++ b/config.json
@@ -40,6 +40,7 @@
     "checkpoint": true,
     "save_step": 5000,
     "print_step": 10,
+    "tb_model_param_stats": true,     // if true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging.
    "run_eval": true,

    "data_path": "../../Data/LJSpeech-1.1/",  // can overwritten from command argument

diff --git a/requirements.txt b/requirements.txt
index 73e5dae7..e49445d5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,5 +8,4 @@ tensorboardX
 matplotlib==2.0.2
 Pillow
 flask
-scipy==0.19.0
-lws
\ No newline at end of file
+scipy==0.19.0
\ No newline at end of file

diff --git a/train.py b/train.py
index 8fe07ded..3d4212bc 100644
--- a/train.py
+++ b/train.py
@@ -22,6 +22,7 @@ from models.tacotron import Tacotron
 from layers.losses import L1LossMasked
 from utils.audio import AudioProcessor
 from utils.synthesis import synthesis
+from utils.logger import Logger

 torch.manual_seed(1)
 use_cuda = torch.cuda.is_available()
@@ -169,15 +170,13 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st,
         avg_step_time += step_time

         # Plot Training Iter Stats
-        tb.add_scalar('TrainIterLoss/TotalLoss', loss.item(), current_step)
-        tb.add_scalar('TrainIterLoss/LinearLoss', linear_loss.item(),
-                      current_step)
-        tb.add_scalar('TrainIterLoss/MelLoss', mel_loss.item(), current_step)
-        tb.add_scalar('Params/LearningRate', optimizer.param_groups[0]['lr'],
-                      current_step)
-        tb.add_scalar('Params/GradNorm', grad_norm, current_step)
-        tb.add_scalar('Params/GradNormSt', grad_norm_st, current_step)
-        tb.add_scalar('Time/StepTime', step_time, current_step)
+        iter_stats = {"loss_postnet": linear_loss.item(),
+                      "loss_decoder": mel_loss.item(),
+                      "lr": current_lr,
+                      "grad_norm": grad_norm,
+                      "grad_norm_st": grad_norm_st,
+                      "step_time": step_time}
+        tb_logger.tb_train_iter_stats(current_step, iter_stats)

         if current_step % c.save_step == 0:
             if c.checkpoint:
@@ -189,28 +188,17 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st,
             # Diagnostic visualizations
             const_spec = linear_output[0].data.cpu().numpy()
             gt_spec = linear_input[0].data.cpu().numpy()
-
-            const_spec = plot_spectrogram(const_spec, ap)
-            gt_spec = plot_spectrogram(gt_spec, ap)
-            tb.add_figure('Visual/Reconstruction', const_spec, current_step)
-            tb.add_figure('Visual/GroundTruth', gt_spec, current_step)
-
             align_img = alignments[0].data.cpu().numpy()
-            align_img = plot_alignment(align_img)
-            tb.add_figure('Visual/Alignment', align_img, current_step)
+
+            figures = {"prediction": plot_spectrogram(const_spec, ap),
+                       "ground_truth": plot_spectrogram(gt_spec, ap),
+                       "alignment": plot_alignment(align_img)}
+            tb_logger.tb_train_figures(figures, current_step)

             # Sample audio
-            audio_signal = linear_output[0].data.cpu().numpy()
-            ap.griffin_lim_iters = 60
-            audio_signal = ap.inv_spectrogram(audio_signal.T)
-            try:
-                tb.add_audio(
-                    'SampleAudio',
-                    audio_signal,
-                    current_step,
-                    sample_rate=c.sample_rate)
-            except:
-                pass
+            tb_logger.tb_train_audios(current_step,
+                                      {'TrainAudio': ap.inv_spectrogram(const_spec.T)},
+                                      c.sample_rate)

     avg_linear_loss /= (num_iter + 1)
     avg_mel_loss /= (num_iter + 1)
@@ -229,12 +217,13 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st,
               flush=True)

     # Plot Training Epoch Stats
-    tb.add_scalar('TrainEpochLoss/TotalLoss', avg_total_loss, current_step)
-    tb.add_scalar('TrainEpochLoss/LinearLoss', avg_linear_loss, current_step)
-    tb.add_scalar('TrainEpochLoss/MelLoss', avg_mel_loss, current_step)
-    tb.add_scalar('TrainEpochLoss/StopLoss', avg_stop_loss, current_step)
-    tb.add_scalar('Time/EpochTime', epoch_time, epoch)
-    epoch_time = 0
+    epoch_stats = {"loss_postnet": avg_linear_loss,
+                   "loss_decoder": avg_mel_loss,
+                   "stop_loss": avg_stop_loss,
+                   "epoch_time": epoch_time}
+    tb_logger.tb_train_epoch_stats(current_step, epoch_stats)
+    if c.tb_model_param_stats:
+        tb_logger.tb_model_weights(model, current_step)
     return avg_linear_loss, current_step

@@ -316,74 +305,45 @@ def evaluate(model, criterion, criterion_st, ap, current_step):
             gt_spec = linear_input[idx].data.cpu().numpy()
             align_img = alignments[idx].data.cpu().numpy()

-            const_spec = plot_spectrogram(const_spec, ap)
-            gt_spec = plot_spectrogram(gt_spec, ap)
-            align_img = plot_alignment(align_img)
-
-            tb.add_figure('ValVisual/Reconstruction', const_spec, current_step)
-            tb.add_figure('ValVisual/GroundTruth', gt_spec, current_step)
-            tb.add_figure('ValVisual/ValidationAlignment', align_img,
-                          current_step)
+            eval_figures = {"prediction": plot_spectrogram(const_spec, ap),
+                            "ground_truth": plot_spectrogram(gt_spec, ap),
+                            "alignment": plot_alignment(align_img)}
+            tb_logger.tb_eval_figures(current_step, eval_figures)

             # Sample audio
-            audio_signal = linear_output[idx].data.cpu().numpy()
-            ap.griffin_lim_iters = 60
-            audio_signal = ap.inv_spectrogram(audio_signal.T)
-            try:
-                tb.add_audio(
-                    'ValSampleAudio',
-                    audio_signal,
-                    current_step,
-                    sample_rate=c.audio["sample_rate"])
-            except:
-                # sometimes audio signal is out of boundaries
-                pass
+            tb_logger.tb_eval_audios(current_step, {"ValAudio": ap.inv_spectrogram(const_spec.T)}, c.audio["sample_rate"])

     # compute average losses
     avg_linear_loss /= (num_iter + 1)
     avg_mel_loss /= (num_iter + 1)
     avg_stop_loss /= (num_iter + 1)
-    avg_total_loss = avg_mel_loss + avg_linear_loss + avg_stop_loss

-    # Plot Learning Stats
-    tb.add_scalar('ValEpochLoss/TotalLoss', avg_total_loss,
-                  current_step)
-    tb.add_scalar('ValEpochLoss/LinearLoss', avg_linear_loss,
-                  current_step)
-    tb.add_scalar('ValEpochLoss/MelLoss', avg_mel_loss, current_step)
-    tb.add_scalar('ValEpochLoss/Stop_loss', avg_stop_loss,
-                  current_step)
+    # Plot Validation Stats
+    epoch_stats = {"loss_postnet": avg_linear_loss,
+                   "loss_decoder": avg_mel_loss,
+                   "stop_loss": avg_stop_loss}
+    tb_logger.tb_eval_stats(current_step, epoch_stats)

     # test sentences
-    ap.griffin_lim_iters = 60
+    test_audios = {}
+    test_figures = {}
     for idx, test_sentence in enumerate(test_sentences):
         try:
             wav, alignment, linear_spec, _, stop_tokens = synthesis(
                 model, test_sentence, c, use_cuda, ap)
-
             file_path = os.path.join(AUDIO_PATH, str(current_step))
             os.makedirs(file_path, exist_ok=True)
             file_path = os.path.join(file_path,
                                      "TestSentence_{}.wav".format(idx))
             ap.save_wav(wav, file_path)
-
-            wav_name = 'TestSentences/{}'.format(idx)
-            tb.add_audio(
-                wav_name,
-                wav,
-                current_step,
-                sample_rate=c.audio['sample_rate'])
-
-            linear_spec = plot_spectrogram(linear_spec, ap)
-            align_img = plot_alignment(alignment)
-            tb.add_figure('TestSentences/{}_Spectrogram'.format(idx),
-                          linear_spec, current_step)
-            tb.add_figure('TestSentences/{}_Alignment'.format(idx), align_img,
-                          current_step)
+            test_audios['{}-audio'.format(idx)] = wav
+            test_figures['{}-prediction'.format(idx)] = plot_spectrogram(linear_spec, ap)
+            test_figures['{}-alignment'.format(idx)] = plot_alignment(alignment)
         except:
             print(" !! Error creating Test Sentence -", idx)
             traceback.print_exc()
-            pass
+    tb_logger.tb_test_audios(current_step, test_audios, c.audio['sample_rate'])
+    tb_logger.tb_test_figures(current_step, test_figures)
     return avg_linear_loss

@@ -496,7 +456,7 @@ if __name__ == '__main__':

     # setup tensorboard
     LOG_DIR = OUT_PATH
-    tb = SummaryWriter(LOG_DIR)
+    tb_logger = Logger(LOG_DIR)

     # Conditional imports
     preprocessor = importlib.import_module('datasets.preprocess')

diff --git a/utils/logger.py b/utils/logger.py
new file mode 100644
index 00000000..c8cfcf28
--- /dev/null
+++ b/utils/logger.py
@@ -0,0 +1,75 @@
+import traceback
+from tensorboardX import SummaryWriter
+
+
+class Logger(object):
+    def __init__(self, log_dir):
+        self.writer = SummaryWriter(log_dir)
+        self.train_stats = {}
+        self.eval_stats = {}
+
+    def tb_model_weights(self, model, step):
+        layer_num = 1
+        for name, param in model.named_parameters():
+            self.writer.add_scalar(
+                "layer{}-ModelParams/{}/max".format(layer_num, name),
+                param.max(), step)
+            self.writer.add_scalar(
+                "layer{}-ModelParams/{}/min".format(layer_num, name),
+                param.min(), step)
+            self.writer.add_scalar(
+                "layer{}-ModelParams/{}/mean".format(layer_num, name),
+                param.mean(), step)
+            self.writer.add_scalar(
+                "layer{}-ModelParams/{}/std".format(layer_num, name),
+                param.std(), step)
+            self.writer.add_histogram(
+                "layer{}-{}/param".format(layer_num, name), param, step)
+            self.writer.add_histogram(
+                "layer{}-{}/grad".format(layer_num, name), param.grad, step)
+            layer_num += 1
+
+    def dict_to_tb_scalar(self, scope_name, stats, step):
+        for key, value in stats.items():
+            self.writer.add_scalar('{}/{}'.format(scope_name, key), value, step)
+
+    def dict_to_tb_figure(self, scope_name, figures, step):
+        for key, value in figures.items():
+            self.writer.add_figure('{}/{}'.format(scope_name, key), value, step)
+
+    def dict_to_tb_audios(self, scope_name, audios, step, sample_rate):
+        for key, value in audios.items():
+            try:
+                self.writer.add_audio('{}/{}'.format(scope_name, key), value, step, sample_rate=sample_rate)
+            except:
+                traceback.print_exc()
+
+    def tb_train_iter_stats(self, step, stats):
+        self.dict_to_tb_scalar("TrainIterStats", stats, step)
+
+    def tb_train_epoch_stats(self, step, stats):
+        self.dict_to_tb_scalar("TrainEpochStats", stats, step)
+
+    def tb_train_figures(self, step, figures):
+        self.dict_to_tb_figure("TrainFigures", figures, step)
+
+    def tb_train_audios(self, step, audios, sample_rate):
+        self.dict_to_tb_audios("TrainAudios", audios, step, sample_rate)
+
+    def tb_eval_stats(self, step, stats):
+        self.dict_to_tb_scalar("EvalStats", stats, step)
+
+    def tb_eval_figures(self, step, figures):
+        self.dict_to_tb_figure("EvalFigures", figures, step)
+
+    def tb_eval_audios(self, step, audios, sample_rate):
+        self.dict_to_tb_audios("EvalAudios", audios, step, sample_rate)
+
+    def tb_test_audios(self, step, audios, sample_rate):
+        self.dict_to_tb_audios("TestAudios", audios, step, sample_rate)
+
+    def tb_test_figures(self, step, figures):
+        self.dict_to_tb_figure("TestFigures", figures, step)
+
+
\ No newline at end of file
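A minimal usage sketch of the new `Logger` API, using only the methods defined in `utils/logger.py` above — the log directory, step values, and payloads below are placeholders, not taken from the patch:

```python
from utils.logger import Logger

tb_logger = Logger("logs/debug-run")  # placeholder log directory

# scalars: one plain dict per call; the scope ("TrainIterStats", ...) is fixed per method
tb_logger.tb_train_iter_stats(100, {"loss_postnet": 0.51, "loss_decoder": 0.73})
tb_logger.tb_train_epoch_stats(100, {"stop_loss": 0.12, "epoch_time": 42.0})

# audios go through dict_to_tb_audios, which catches and prints exceptions,
# so a single out-of-range waveform cannot crash a training run:
# tb_logger.tb_train_audios(100, {"TrainAudio": wav}, 22050)
```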
From 3ad38621d46970386cfba048816bbc0c501cea9e Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Thu, 13 Dec 2018 18:19:02 +0100
Subject: [PATCH 02/17] bug fix for partial model initialization: if the model
 cannot be initialized from the checkpoint as a whole, try to initialize it
 partially, using only the layers that match in size

---
 train.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/train.py b/train.py
index 3d4212bc..37b48cba 100644
--- a/train.py
+++ b/train.py
@@ -360,17 +360,21 @@ def main(args):

     if args.restore_path:
         checkpoint = torch.load(args.restore_path)
-        model.load_state_dict(checkpoint['model'])
-        # Partial initialization: if there is a mismatch with new and old layer, it is skipped.
-        # 1. filter out unnecessary keys
-        pretrained_dict = {
-            k: v
-            for k, v in checkpoint['model'].items() if k in model_dict
-        }
-        # 2. overwrite entries in the existing state dict
-        model_dict.update(pretrained_dict)
-        # 3. load the new state dict
-        model.load_state_dict(model_dict)
+        try:
+            model.load_state_dict(checkpoint['model'])
+        except:
+            print(" > Partial model initialization.")
+            model_dict = model.state_dict()
+            # Partial initialization: if there is a mismatch with new and old layer, it is skipped.
+            # 1. filter out unnecessary keys
+            pretrained_dict = {
+                k: v
+                for k, v in checkpoint['model'].items() if k in model_dict
+            }
+            # 2. overwrite entries in the existing state dict
+            model_dict.update(pretrained_dict)
+            # 3. load the new state dict
+            model.load_state_dict(model_dict)
     if use_cuda:
         model = model.cuda()
         criterion.cuda()
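A note on the fallback above: the dict comprehension filters checkpoint entries by key name only, so a layer that kept its name but changed shape would still make `load_state_dict` fail. A shape-aware variant of the same pattern — a hypothetical extension, not what this patch ships — would be:

```python
def partial_restore(model, checkpoint_state):
    """Load only the checkpoint tensors whose name AND shape match the model."""
    model_dict = model.state_dict()
    pretrained_dict = {
        k: v
        for k, v in checkpoint_state.items()
        if k in model_dict and v.shape == model_dict[k].shape
    }
    model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict)
    return sorted(pretrained_dict)  # names of the layers that were restored

# usage sketch:
#   checkpoint = torch.load(args.restore_path)
#   restored = partial_restore(model, checkpoint['model'])
```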
From 7d2ef7fbbce132c4de1374b1e087db1813f8f448 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Fri, 14 Dec 2018 11:02:58 +0100
Subject: [PATCH 03/17] bug fix

---
 train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/train.py b/train.py
index 37b48cba..783cf5b8 100644
--- a/train.py
+++ b/train.py
@@ -193,7 +193,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st,
             figures = {"prediction": plot_spectrogram(const_spec, ap),
                        "ground_truth": plot_spectrogram(gt_spec, ap),
                        "alignment": plot_alignment(align_img)}
-            tb_logger.tb_train_figures(figures, current_step)
+            tb_logger.tb_train_figures(current_step, figures)

             # Sample audio
             tb_logger.tb_train_audios(current_step,

From fb35fd0f35e88e3cc333ee87a83309a8b2b7f2c9 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Mon, 17 Dec 2018 16:32:04 +0100
Subject: [PATCH 04/17] README update

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8d6fae6e..1482cdab 100644
--- a/README.md
+++ b/README.md
@@ -108,7 +108,7 @@ Please feel free to offer new changes and pull things off. We are happy to discu
 - Punctuations at the end of a sentence sometimes affect the pronounciation of the last word. Because punctuation sign is attended by the attention module , that forces network to create a voice signal or at least modify the voice signal being generated for neighboring frames.
 - ~~Simpler stop-token prediction. Right now we use RNN to keep the history of the previous frames. However, we never tested, if something simpler would work as well.~~ Yet RNN based model gives more stable predictions.
 - Train for better mel-specs. Mel-spectrograms are not good enough to be fed Neural Vocoder. Easy solution to this problem is to train the model with r=1. However,in this case model struggles to align the attention.
-- irregular words: "minute", "focus", "aren't" etc. Even though, ~~it might be solved~~ (Nancy dataset give much better results compared to LJSpeech) it is solved by a larger or better dataset, some of irregular words cause network to mis-pronounce. Irregular means in this context is that written form and pronounciation of a word have a unique disparity.
+- irregular words: "minute", "focus", "aren't" etc. Even though ~~it might be solved~~ (the Nancy dataset delivers much better quality than LJSpeech) it can be solved by a larger or a better dataset, some irregular words still cause the network to mispronounce.

 ## Major TODOs
 - [x] Implement the model.
From d38872a4d0b4d902d6ce3979560d0c4f4258ac82 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Mon, 17 Dec 2018 16:32:45 +0100
Subject: [PATCH 05/17] Add cached option to TTSDataset.py, deprecating
 TTSDatasetCached

---
 datasets/TTSDataset.py       | 83 +++++++++++++++++++++++++++++-------
 datasets/TTSDatasetCached.py |  4 +-
 2 files changed, 69 insertions(+), 18 deletions(-)

diff --git a/datasets/TTSDataset.py b/datasets/TTSDataset.py
index a9d111b0..e97b38af 100644
--- a/datasets/TTSDataset.py
+++ b/datasets/TTSDataset.py
@@ -20,7 +20,27 @@ class MyDataset(Dataset):
                  ap,
                  preprocessor,
                  batch_group_size=0,
-                 min_seq_len=0):
+                 min_seq_len=0,
+                 max_seq_len=float("inf"),
+                 cached=False):
+        """
+        Args:
+            root_path (str): root path for the data folder.
+            meta_file (str): name for dataset file including audio transcripts
+                and file names (or paths in cached mode).
+            outputs_per_step (int): number of time frames predicted per step.
+            text_cleaner (str): text cleaner used for the dataset.
+            ap (TTS.utils.AudioProcessor): audio processor object.
+            preprocessor (dataset.preprocess.Class): preprocessor for the dataset.
+                Create your own if you need to run a new dataset.
+            batch_group_size (int): (0) range of batch randomization after sorting
+                sequences by length.
+            min_seq_len (int): (0) minimum sequence length to be processed
+                by the loader.
+            max_seq_len (int): (float("inf")) maximum sequence length.
+            cached (bool): (false) true if the given data path is created
+                by extract_features.py.
+        """
         self.root_path = root_path
         self.batch_group_size = batch_group_size
         self.items = preprocessor(root_path, meta_file)
@@ -28,9 +48,14 @@ class MyDataset(Dataset):
         self.sample_rate = ap.sample_rate
         self.cleaners = text_cleaner
         self.min_seq_len = min_seq_len
+        self.max_seq_len = max_seq_len
         self.ap = ap
-        print(" > Reading LJSpeech from - {}".format(root_path))
+        self.cached = cached
+        print(" > DataLoader initialization")
+        print(" | > Data path: {}".format(root_path))
+        print(" | > Cached dataset: {}".format(self.cached))
         print(" | > Number of instances : {}".format(len(self.items)))
+
         self.sort_items()

     def load_wav(self, filename):
@@ -40,24 +65,51 @@ class MyDataset(Dataset):
         except RuntimeError as e:
             print(" !! Cannot read file : {}".format(filename))

+    def load_np(self, filename):
+        data = np.load(filename).astype('float32')
+        return data
+
+    def load_data(self, idx):
+        if self.cached:
+            wav_name = self.items[idx][1]
+            mel_name = self.items[idx][2]
+            linear_name = self.items[idx][3]
+            text = self.items[idx][0]
+            text = np.asarray(
+                text_to_sequence(text, [self.cleaners]), dtype=np.int32)
+            if wav_name.split('.')[-1] == 'npy':
+                wav = self.load_np(wav_name)
+            else:
+                wav = np.asarray(self.load_wav(wav_name), dtype=np.float32)
+            mel = self.load_np(mel_name)
+            linear = self.load_np(linear_name)
+            sample = {'text': text, 'wav': wav, 'item_idx': self.items[idx][1], 'mel': mel, 'linear': linear}
+        else:
+            text, wav_file = self.items[idx]
+            text = np.asarray(
+                text_to_sequence(text, [self.cleaners]), dtype=np.int32)
+            wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
+            sample = {'text': text, 'wav': wav, 'item_idx': self.items[idx][1]}
+        return sample
+
     def sort_items(self):
-        r"""Sort text sequences in ascending order"""
+        r"""Sort instances based on text length in ascending order"""
         lengths = np.array([len(ins[0]) for ins in self.items])
-        print(" | > Max length sequence {}".format(np.max(lengths)))
-        print(" | > Min length sequence {}".format(np.min(lengths)))
-        print(" | > Avg length sequence {}".format(np.mean(lengths)))
+        print(" | > Max length sequence: {}".format(np.max(lengths)))
+        print(" | > Min length sequence: {}".format(np.min(lengths)))
+        print(" | > Avg length sequence: {}".format(np.mean(lengths)))
         idxs = np.argsort(lengths)
         new_items = []
         ignored = []
         for i, idx in enumerate(idxs):
             length = lengths[idx]
-            if length < self.min_seq_len:
+            if length < self.min_seq_len or length > self.max_seq_len:
                 ignored.append(idx)
             else:
                 new_items.append(self.items[idx])
-        print(" | > {} instances are ignored by min_seq_len ({})".format(
+        print(" | > {} instances are ignored ({})".format(
             len(ignored), self.min_seq_len))
         # shuffle batch groups
         if self.batch_group_size > 0:
@@ -74,12 +126,7 @@ class MyDataset(Dataset):
         return len(self.items)

     def __getitem__(self, idx):
-        text, wav_file = self.items[idx]
-        text = np.asarray(
-            text_to_sequence(text, [self.cleaners]), dtype=np.int32)
-        wav = np.asarray(self.load_wav(wav_file), dtype=np.float32)
-        sample = {'text': text, 'wav': wav, 'item_idx': self.items[idx][1]}
-        return sample
+        return self.load_data(idx)

     def collate_fn(self, batch):
         r"""
@@ -101,8 +148,12 @@ class MyDataset(Dataset):
             text_lenghts = np.array([len(x) for x in text])
             max_text_len = np.max(text_lenghts)

-            linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
-            mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
+            if self.cached:
+                mel = [d['mel'] for d in batch]
+                linear = [d['linear'] for d in batch]
+            else:
+                mel = [self.ap.melspectrogram(w).astype('float32') for w in wav]
+                linear = [self.ap.spectrogram(w).astype('float32') for w in wav]
             mel_lengths = [m.shape[1] + 1 for m in mel]  # +1 for zero-frame

             # compute 'stop token' targets

diff --git a/datasets/TTSDatasetCached.py b/datasets/TTSDatasetCached.py
index b5c6d4ce..28033a80 100644
--- a/datasets/TTSDatasetCached.py
+++ b/datasets/TTSDatasetCached.py
@@ -151,8 +151,8 @@ class MyDataset(Dataset):
         # convert things to pytorch
         text_lenghts = torch.LongTensor(text_lenghts)
         text = torch.LongTensor(text)
-        linear = torch.FloatTensor(linear)
-        mel = torch.FloatTensor(mel)
+        linear = torch.FloatTensor(linear).contiguous()
+        mel = torch.FloatTensor(mel).contiguous()
         mel_lengths = torch.LongTensor(mel_lengths)
         stop_targets = torch.FloatTensor(stop_targets)
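With TTSDatasetCached deprecated, the one surviving class serves both modes. A construction sketch for each (data paths are placeholders; the `load_config`/`AudioProcessor` usage mirrors `extract_features.py`, and the cached metafile name follows `tests/loader_tests.py`):

```python
from torch.utils.data import DataLoader
from datasets.TTSDataset import MyDataset
from datasets.preprocess import tweb, tts_cache
from utils.audio import AudioProcessor
from utils.generic_utils import load_config

c = load_config("config.json")
ap = AudioProcessor(**c.audio)

# on-the-fly mode: metafile rows are [text, wav_path]; specs computed in collate_fn
train_set = MyDataset("/data/TWEB/", "transcript.txt", c.r, c.text_cleaner,
                      ap=ap, preprocessor=tweb, min_seq_len=c.min_seq_len,
                      max_seq_len=c.max_seq_len)

# cached mode: rows also carry the .npy paths written by extract_features.py
cached_set = MyDataset("/data/tts_cache/", "tts_metadata.csv", c.r,
                       c.text_cleaner, ap=ap, preprocessor=tts_cache,
                       max_seq_len=c.max_seq_len, cached=True)

loader = DataLoader(cached_set, batch_size=c.batch_size, num_workers=4,
                    collate_fn=cached_set.collate_fn)
```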
From 4587c72a03a93c402cb8b735612159b37ba0ee0d Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Mon, 17 Dec 2018 16:33:29 +0100
Subject: [PATCH 06/17] Add preprocessor for TWEB dataset

---
 datasets/preprocess.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/datasets/preprocess.py b/datasets/preprocess.py
index 89d464a0..67c184ef 100644
--- a/datasets/preprocess.py
+++ b/datasets/preprocess.py
@@ -13,10 +13,20 @@ def tts_cache(root_path, meta_file):
     return items


-# def tweb(root_path, meta_file):
-#     # TODO
-#     pass
-#     return
+def tweb(root_path, meta_file):
+    """Normalize TWEB dataset.
+    https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset
+    """
+    txt_file = os.path.join(root_path, meta_file)
+    items = []
+    with open(txt_file, 'r') as ttf:
+        for line in ttf:
+            cols = line.split('\t')
+            wav_file = os.path.join(root_path, cols[0]+'.wav')
+            text = cols[1]
+            items.append([text, wav_file])
+    random.shuffle(items)
+    return items


 # def kusal(root_path, meta_file):
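The `tweb` adaptor above expects one tab-separated `<file-id>\t<transcript>` line per clip. A sketch of the expected input and the normalized output (file names below are hypothetical):

```python
from datasets.preprocess import tweb

# transcript.txt, one line per clip (tab-separated, illustrative):
#   wavs/chp_001_0001<TAB>In the beginning God created the heaven and the earth.

items = tweb("/data/TWEB/", "transcript.txt")
# each item is [text, wav_path], e.g.:
#   ["In the beginning ...", "/data/TWEB/wavs/chp_001_0001.wav"]
# note: the list is shuffled before being returned
```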
From b42e3d12a8bd9fc25a455ad969ba47113ac6119b Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Mon, 17 Dec 2018 16:34:24 +0100
Subject: [PATCH 07/17] update extract_features.py and the order of columns in
 the generated dataset file

---
 extract_features.py | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/extract_features.py b/extract_features.py
index 56629d1d..fad1f899 100644
--- a/extract_features.py
+++ b/extract_features.py
@@ -18,9 +18,7 @@ from multiprocessing import Pool
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument('--data_path', type=str, help='Data folder.')
-    parser.add_argument('--cache_path', type=str, help='Cache folder, place to output all the intermediate spectrogram files.')
-    # parser.add_argument('--keep_cache', type=bool, help='If True, it keeps the cache folder.')
-    # parser.add_argument('--hdf5_path', type=str, help='hdf5 folder.')
+    parser.add_argument('--cache_path', type=str, help='Cache folder, place to output all the spectrogram files.')
     parser.add_argument(
         '--config', type=str, help='conf.json file for run settings.')
     parser.add_argument(
@@ -49,24 +47,14 @@ if __name__ == "__main__":
     print(" > Input path: ", DATA_PATH)
     print(" > Cache path: ", CACHE_PATH)

-    # audio = importlib.import_module('utils.' + c.audio_processor)
-    # AudioProcessor = getattr(audio, 'AudioProcessor')
     ap = AudioProcessor(**CONFIG.audio)

-    def trim_silence(self, wav):
-        """ Trim silent parts with a threshold and 0.1 sec margin """
-        margin = int(ap.sample_rate * 0.1)
-        wav = wav[margin:-margin]
-        return librosa.effects.trim(
-            wav, top_db=40, frame_length=1024, hop_length=256)[0]

     def extract_mel(item):
         """ Compute spectrograms, length information """
         text = item[0]
         file_path = item[1]
         x = ap.load_wav(file_path, ap.sample_rate)
-        if args.trim_silence:
-            x = trim_silence(x)
         file_name = os.path.basename(file_path).replace(".wav", "")
         mel_file = file_name + "_mel"
         mel_path = os.path.join(CACHE_PATH, 'mel', mel_file)
@@ -74,20 +62,20 @@ if __name__ == "__main__":
         np.save(mel_path, mel, allow_pickle=False)
         mel_len = mel.shape[1]
         wav_len = x.shape[0]
-        output = [file_path, mel_path+".npy", str(wav_len), str(mel_len), text]
+        output = [text, file_path, mel_path+".npy", str(wav_len), str(mel_len)]
         if not args.only_mel:
             linear_file = file_name + "_linear"
             linear_path = os.path.join(CACHE_PATH, 'linear', linear_file)
             linear = ap.spectrogram(x.astype('float32')).astype('float32')
             linear_len = linear.shape[1]
             np.save(linear_path, linear, allow_pickle=False)
-            output.insert(2, linear_path+".npy")
+            output.insert(3, linear_path+".npy")
         if args.process_audio:
             audio_file = file_name + "_audio"
             audio_path = os.path.join(CACHE_PATH, 'audio', audio_file)
             np.save(audio_path, x, allow_pickle=False)
             del output[0]
             output.insert(1, audio_path+".npy")
         assert mel_len == linear_len
         return output
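After this reordering, a full metadata row (neither `--only_mel` nor `--process_audio` given) lines up with the column indices that `TTSDataset` reads in cached mode — illustrative values only:

```python
# tts_metadata row layout produced by extract_mel() after this patch:
row = [
    "Hello world.",                           # 0: text        -> items[idx][0]
    "/data/wavs/sample_0001.wav",             # 1: audio path  -> items[idx][1]
    "/cache/mel/sample_0001_mel.npy",         # 2: mel path    -> items[idx][2]
    "/cache/linear/sample_0001_linear.npy",   # 3: linear path -> items[idx][3]
    "110250",                                 # 4: wav length in samples
    "87",                                     # 5: mel length in frames
]
```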
From 3cb1f5d4ccb3fddf91d0debdb6ad0a59fe5ad8ec Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Mon, 17 Dec 2018 16:35:52 +0100
Subject: [PATCH 08/17] update loader_tests.py

---
 tests/loader_tests.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/tests/loader_tests.py b/tests/loader_tests.py
index c945a592..7fc003a1 100644
--- a/tests/loader_tests.py
+++ b/tests/loader_tests.py
@@ -23,6 +23,9 @@ if not os.path.exists(c.data_path_cache):
 if not os.path.exists(c.data_path):
     DATA_EXIST = False

+print(" > Dynamic data loader test: {}".format(DATA_EXIST))
+print(" > Cache data loader test: {}".format(CACHE_EXIST))
+

 class TestTTSDataset(unittest.TestCase):
     def __init__(self, *args, **kwargs):
         super(TestTTSDataset, self).__init__(*args, **kwargs)
@@ -199,7 +202,7 @@ class TestTTSDatasetCached(unittest.TestCase):

     def _create_dataloader(self, batch_size, r, bgs):
-        dataset = TTSDatasetCached.MyDataset(
+        dataset = TTSDataset.MyDataset(
             c.data_path_cache,
             'tts_metadata.csv',
             r,
@@ -207,7 +210,9 @@ class TestTTSDatasetCached(unittest.TestCase):
             preprocessor=tts_cache,
             ap=self.ap,
             batch_group_size=bgs,
-            min_seq_len=c.min_seq_len)
+            min_seq_len=c.min_seq_len,
+            max_seq_len=c.max_seq_len,
+            cached=True)

         dataloader = DataLoader(
             dataset,
@@ -299,11 +304,17 @@ class TestTTSDatasetCached(unittest.TestCase):
                     abs(mel.T).astype("float32") - abs(mel_dl[:-1])).sum()

                 # check mel-spec correctness
-                mel_spec = mel_input[0].cpu().numpy()
+                mel_spec = mel_input[-1].cpu().numpy()
                 wav = self.ap.inv_mel_spectrogram(mel_spec.T)
                 self.ap.save_wav(wav, OUTPATH + '/mel_inv_dataloader_cache.wav')
-                shutil.copy(item_idx[0], OUTPATH + '/mel_target_dataloader_cache.wav')
+                shutil.copy(item_idx[-1], OUTPATH + '/mel_target_dataloader_cache.wav')
+
+                # check linear-spec
+                linear_spec = linear_input[-1].cpu().numpy()
+                wav = self.ap.inv_spectrogram(linear_spec.T)
+                self.ap.save_wav(wav, OUTPATH + '/linear_inv_dataloader_cache.wav')
+                shutil.copy(item_idx[-1], OUTPATH + '/linear_target_dataloader_cache.wav')

                 # check the last time step to be zero padded
                 assert mel_input[0, -1].sum() == 0

From 2a4adf0c3373485d30ca32b1c53384094935e2d6 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Mon, 17 Dec 2018 16:36:19 +0100
Subject: [PATCH 09/17] update test_config.json

---
 tests/test_config.json | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/test_config.json b/tests/test_config.json
index 05a8137d..7283664c 100644
--- a/tests/test_config.json
+++ b/tests/test_config.json
@@ -16,7 +16,8 @@
     "clip_norm": true,     // clip normalized values into the range.
     "max_norm": 4,         // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
     "mel_fmin": 95,        // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-    "mel_fmax": 7600       // maximum freq level for mel-spec. Tune for dataset!!
+    "mel_fmax": 7600,      // maximum freq level for mel-spec. Tune for dataset!!
+    "do_trim_silence": false
   },
   "hidden_size": 128,
   "embedding_size": 256,
@@ -34,8 +35,9 @@
   "save_step": 200,

   "data_path": "/home/erogol/Data/LJSpeech-1.1/",
-  "data_path_cache": "/home/erogol/Data/LJSpeech-1.1/tts_cache/",
+  "data_path_cache": "/media/erogol/data_ssd/Data/Nancy/tts_cache/",
   "output_path": "result",
   "min_seq_len": 0,
+  "max_seq_len": 300,
   "log_dir": "/home/erogol/projects/TTS/logs/"
 }
From 8ff9253abd027af0e5cefc85cd878baa1d13295d Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Mon, 17 Dec 2018 16:37:06 +0100
Subject: [PATCH 10/17] bug fixes on train.py

---
 train.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/train.py b/train.py
index 783cf5b8..7dd5b78d 100644
--- a/train.py
+++ b/train.py
@@ -43,7 +43,9 @@ def setup_loader(is_val=False):
         preprocessor=preprocessor,
         ap=ap,
         batch_group_size=0 if is_val else 8 * c.batch_size,
-        min_seq_len=0 if is_val else c.min_seq_len)
+        min_seq_len=0 if is_val else c.min_seq_len,
+        max_seq_len=float("inf") if is_val else c.max_seq_len
+        cached=False if c.dataset ~= "tts_cache" else True)
     loader = DataLoader(
         dataset,
         batch_size=c.eval_batch_size if is_val else c.batch_size,
@@ -164,8 +166,8 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st,
                   grad_norm, grad_norm_st, avg_text_length, avg_spec_length,
                   step_time, current_lr),
               flush=True)

-        avg_linear_loss += linear_loss.item()
-        avg_mel_loss += mel_loss.item()
+        avg_linear_loss += float(linear_loss.item())
+        avg_mel_loss += float(mel_loss.item())
         avg_stop_loss += stop_loss.item()
         avg_step_time += step_time
@@ -198,7 +200,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st,
             # Sample audio
             tb_logger.tb_train_audios(current_step,
                                       {'TrainAudio': ap.inv_spectrogram(const_spec.T)},
-                                      c.sample_rate)
+                                      c.audio["sample_rate"])

     avg_linear_loss /= (num_iter + 1)
     avg_mel_loss /= (num_iter + 1)
@@ -295,8 +297,8 @@ def evaluate(model, criterion, criterion_st, ap, current_step):
                       stop_loss.item()),
                   flush=True)

-            avg_linear_loss += linear_loss.item()
-            avg_mel_loss += mel_loss.item()
+            avg_linear_loss += float(linear_loss.item())
+            avg_mel_loss += float(mel_loss.item())
             avg_stop_loss += stop_loss.item()

             # Diagnostic visualizations
@@ -442,7 +444,7 @@ if __name__ == '__main__':
         default=False,
         help='Do not verify commit integrity to run training.')
     parser.add_argument(
-        '--data_path', type=str, help='dataset path.', default='Defines the data path. It overwrites config.json.')
+        '--data_path', type=str, default='', default='Defines the data path. It overwrites config.json.')
     args = parser.parse_args()

     # setup output paths and read configs

From c245cb4f75db6a5cf9622432a441227f23c96c06 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Mon, 17 Dec 2018 16:37:39 +0100
Subject: [PATCH 11/17] config.json updates including TTSDataset cached mode

---
 config.json | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/config.json b/config.json
index bd49f6e8..b8ce9a90 100644
--- a/config.json
+++ b/config.json
@@ -1,5 +1,5 @@
 {
-    "model_name": "TTS-master",
+    "model_name": "TTS-dev-tweb",
     "model_description": "Higher dropout rate for stopnet and disabled custom initialization, pull current mel prediction to stopnet.",

     "audio":{
@@ -22,18 +22,18 @@
         "clip_norm": true,        // clip normalized values into the range.
         "mel_fmin": null,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
         "mel_fmax": null,         // maximum freq level for mel-spec. Tune for dataset!!
-        "do_trim_silence": true   // enable trimming of slience of audio as you load it.
+        "do_trim_silence": false  // enable trimming of silence of audio as you load it. LJSpeech (false), TWEB (false), Nancy (true)
    },

    "embedding_size": 256,
    "text_cleaner": "english_cleaners",

    "epochs": 1000,
-    "lr": 0.0001,
+    "lr": 0.001,
    "lr_decay": false,
    "warmup_steps": 4000,
-    "batch_size": 32,
+    "batch_size": 20,
    "eval_batch_size":32,
    "r": 5,
    "wd": 0.000001,
@@ -44,12 +44,12 @@
    "run_eval": true,
    "data_path": "../../Data/LJSpeech-1.1/",  // can overwritten from command argument
-    "meta_file_train": "prompts_train.data",  // metafile for training dataloader
-    "meta_file_val": "prompts_val.data",      // metafile for validation dataloader
-    "data_loader": "TTSDataset",              // dataloader, ["TTSDataset", "TTSDatasetCached", "TTSDatasetMemory"]
-    "dataset": "nancy",                       // one of TTS.dataset.preprocessors, only valid id dataloader == "TTSDataset", rest uses "tts_cache" by default.
-    "min_seq_len": 0,
-    "output_path": "../keep/",
-    "num_loader_workers": 8,
-    "num_val_loader_workers": 4
+    "meta_file_train": "transcript.txt",      // metafile for training dataloader.
+    "meta_file_val": "",                      // metafile for evaluation dataloader.
+    "dataset": "tweb",                        // one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
+    "min_seq_len": 0,                         // minimum text length to use in training
+    "max_seq_len": 300,                       // maximum text length
+    "output_path": "../keep/",                // output path for all training outputs.
+    "num_loader_workers": 8,                  // number of training data loader processes. Don't set it too big. 4-8 are good values.
+    "num_val_loader_workers": 4               // number of evaluation data loader processes.
 }
\ No newline at end of file
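For a pre-computed run, the data section of `config.json` would instead point at the cache folder — a hypothetical fragment (paths are placeholders; the metafile name is the one used by `tests/loader_tests.py`):

```json
{
    "data_path": "/data/tts_cache/",        // folder written by extract_features.py
    "meta_file_train": "tts_metadata.csv",  // metafile generated with the features
    "meta_file_val": "",
    "dataset": "tts_cache",                 // selects the cached preprocessor (and cached mode in the loader)
    "min_seq_len": 0,
    "max_seq_len": 300
}
```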
From 22664a52c85cf3fc0ffd2fa35e492c4d133f67e1 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Mon, 17 Dec 2018 16:38:12 +0100
Subject: [PATCH 12/17] Place the model name at the beginning of the generated
 output folder name

---
 utils/generic_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/generic_utils.py b/utils/generic_utils.py
index 8bf348dd..7d4c961f 100644
--- a/utils/generic_utils.py
+++ b/utils/generic_utils.py
@@ -53,7 +53,7 @@ def create_experiment_folder(root_path, model_name, debug):
     else:
         commit_hash = get_commit_hash()
     output_folder = os.path.join(
-        root_path, date_str + '-' + model_name + '-' + commit_hash)
+        root_path, model_name + '-' + date_str + '-' + commit_hash)
     os.makedirs(output_folder, exist_ok=True)
     print(" > Experiment folder: {}".format(output_folder))
     return output_folder

From b11f307b0e4f3a4e77fd897d420fca0341120376 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Mon, 17 Dec 2018 16:38:36 +0100
Subject: [PATCH 13/17] Logger field naming update for layer stats

---
 utils/logger.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/utils/logger.py b/utils/logger.py
index c8cfcf28..2b1e262d 100644
--- a/utils/logger.py
+++ b/utils/logger.py
@@ -12,16 +12,16 @@ class Logger(object):
         layer_num = 1
         for name, param in model.named_parameters():
             self.writer.add_scalar(
-                "layer{}-ModelParams/{}/max".format(layer_num, name),
+                "layer{}-{}/max".format(layer_num, name),
                 param.max(), step)
             self.writer.add_scalar(
-                "layer{}-ModelParams/{}/min".format(layer_num, name),
+                "layer{}-{}/min".format(layer_num, name),
                 param.min(), step)
             self.writer.add_scalar(
-                "layer{}-ModelParams/{}/mean".format(layer_num, name),
+                "layer{}-{}/mean".format(layer_num, name),
                 param.mean(), step)
             self.writer.add_scalar(
-                "layer{}-ModelParams/{}/std".format(layer_num, name),
+                "layer{}-{}/std".format(layer_num, name),
                 param.std(), step)
             self.writer.add_histogram(
                 "layer{}-{}/param".format(layer_num, name), param, step)

From 048ebd187e210ca5fc9d4ba9dc975c7906dd38cd Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Tue, 18 Dec 2018 01:30:15 +0100
Subject: [PATCH 14/17] Readme update

---
 README.md | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 1482cdab..1e800efd 100644
--- a/README.md
+++ b/README.md
@@ -55,11 +55,16 @@ Audio length is approximately 6 secs.
 |3.01|GTX1080Ti|60|


-## Data
-TTS provides a generic dataloder easy to use for new datasets. You need to write an adaptor to formatyour dataset.Check ```datasets/preprocess.py``` to see example adaptors. After your adaptor, you need to set ```dataset``` field in ```config.json``` accordingly. Some example datasets, we successfuly applied TTS, are linked below.
+## Datasets and Data-Loading
+TTS provides a generic data loader that is easy to use for new datasets. You only need to write an adaptor that formats your dataset; check ```datasets/preprocess.py``` for example adaptors. After you write an adaptor, set the ```dataset``` field in ```config.json```. Do not forget the other data-related fields as well.
+
+You can also use pre-computed features. In this case, compute the features with ```extract_features.py``` and set the ```dataset``` field to ```tts_cache```.
+
+Example datasets we successfully applied TTS to are linked below.
 - [LJ Speech](https://keithito.com/LJ-Speech-Dataset/)
 - [Nancy](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011/)
+- [TWEB](https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset)

 ## Training and Fine-tuning LJ-Speech
 Split ```metadata.csv``` into train and validation subsets respectively ```metadata_train.csv``` and ```metadata_val.csv```. Note that having a validation split does not work well as oppose to other ML problems since at the validation time model generates spectrogram slices without "Teacher-Forcing" and that leads misalignment between the ground-truth and the prediction. Therefore, validation loss does not really show the model performance. Rather, you might use the all data for training and check the model performance by relying on human inspection.

From 3c8eb517131a3543969774e61739abf294a308c7 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Tue, 18 Dec 2018 01:30:30 +0100
Subject: [PATCH 15/17] remove intermediate tensors as soon as possible

---
 layers/attention.py | 10 ++++------
 layers/tacotron.py  |  7 ++++++-
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/layers/attention.py b/layers/attention.py
index 534e4ba4..ea31768a 100644
--- a/layers/attention.py
+++ b/layers/attention.py
@@ -86,15 +86,15 @@ class LocationSensitiveAttention(nn.Module):
         if query.dim() == 2:
             # insert time-axis for broadcasting
             query = query.unsqueeze(1)
-        loc_conv = self.loc_conv(loc)
-        loc_conv = loc_conv.transpose(1, 2)
-        processed_loc = self.loc_linear(loc_conv)
+        processed_loc = self.loc_linear(self.loc_conv(loc).transpose(1, 2))
         processed_query = self.query_layer(query)
         # cache annots
         if self.processed_annots is None:
             self.processed_annots = self.annot_layer(annot)
         alignment = self.v(
             torch.tanh(processed_query + self.processed_annots + processed_loc))
+        del processed_loc
+        del processed_query
         # (batch, max_time)
         return alignment.squeeze(-1)
@@ -138,11 +138,9 @@ class AttentionRNNCell(nn.Module):
         """
         if t == 0:
             self.alignment_model.reset()
-        # Concat input query and previous context context
-        rnn_input = torch.cat((memory, context), -1)
         # Feed it to RNN
         # s_i = f(y_{i-1}, c_{i}, s_{i-1})
-        rnn_output = self.rnn_cell(rnn_input, rnn_state)
+        rnn_output = self.rnn_cell(torch.cat((memory, context), -1), rnn_state)
         # Alignment
         # (batch, max_time)
         # e_{ij} = a(s_{i-1}, h_j)

diff --git a/layers/tacotron.py b/layers/tacotron.py
index e9d7d7ce..6b6cadb0 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -403,6 +403,7 @@ class Decoder(nn.Module):
             attention_rnn_hidden, current_context_vec, attention = self.attention_rnn(
                 processed_memory, current_context_vec, attention_rnn_hidden,
                 inputs, attention_cat, mask, t)
+            del attention_cat
             attention_cum += attention
             # Concat RNN output and attention context vector
             decoder_input = self.project_to_decoder_in(
@@ -414,15 +415,19 @@ class Decoder(nn.Module):
                 # Residual connectinon
                 decoder_input = decoder_rnn_hiddens[idx] + decoder_input
             decoder_output = decoder_input
+            del decoder_input
             # predict mel vectors from decoder vectors
             output = self.proj_to_mel(decoder_output)
             output = torch.sigmoid(output)
             # predict stop token
-            stopnet_input = torch.cat([decoder_input, output], -1)
+            stopnet_input = torch.cat([decoder_output, output], -1)
+            del decoder_output
             stop_token = self.stopnet(stopnet_input)
+            del stopnet_input
             outputs += [output]
             attentions += [attention]
             stop_tokens += [stop_token]
+            del output
             t += 1
             if memory is not None:
                 if t >= T_decoder:
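The `del` statements above only drop Python references, but inside a long decoder loop that is exactly what matters: once the last name pointing to an intermediate tensor is gone (and the autograd graph no longer needs its buffer), the allocator can reuse that memory within the same step instead of holding it until the end of the loop body. A minimal sketch of the pattern, with placeholder shapes:

```python
import torch

def decoder_like_step(x, w1, w2):
    h = torch.matmul(x, w1)   # large intermediate activation
    y = torch.tanh(h)
    del h                     # h had its last use above; release it now
    out = torch.matmul(y, w2)
    del y                     # likewise, do not keep y alive until return
    return out

out = decoder_like_step(torch.randn(32, 256), torch.randn(256, 1024),
                        torch.randn(1024, 80))
```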
From 7cdeef1b5cefd2c40a9391141cb9ef365a114234 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Tue, 18 Dec 2018 12:58:09 +0100
Subject: [PATCH 16/17] bug fixes

---
 train.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/train.py b/train.py
index 7dd5b78d..f52d365d 100644
--- a/train.py
+++ b/train.py
@@ -20,6 +20,7 @@ from utils.generic_utils import (
 from utils.visual import plot_alignment, plot_spectrogram
 from models.tacotron import Tacotron
 from layers.losses import L1LossMasked
+from datasets.TTSDataset import MyDataset
 from utils.audio import AudioProcessor
 from utils.synthesis import synthesis
 from utils.logger import Logger
@@ -44,8 +45,8 @@ def setup_loader(is_val=False):
         ap=ap,
         batch_group_size=0 if is_val else 8 * c.batch_size,
         min_seq_len=0 if is_val else c.min_seq_len,
-        max_seq_len=float("inf") if is_val else c.max_seq_len
-        cached=False if c.dataset ~= "tts_cache" else True)
+        max_seq_len=float("inf") if is_val else c.max_seq_len,
+        cached=False if c.dataset != "tts_cache" else True)
     loader = DataLoader(
         dataset,
         batch_size=c.eval_batch_size if is_val else c.batch_size,
@@ -444,7 +445,7 @@ if __name__ == '__main__':
         default=False,
         help='Do not verify commit integrity to run training.')
     parser.add_argument(
-        '--data_path', type=str, default='', default='Defines the data path. It overwrites config.json.')
+        '--data_path', type=str, default='', help='Defines the data path. It overwrites config.json.')
     args = parser.parse_args()

     # setup output paths and read configs
@@ -467,8 +468,6 @@ if __name__ == '__main__':
     # Conditional imports
     preprocessor = importlib.import_module('datasets.preprocess')
     preprocessor = getattr(preprocessor, c.dataset.lower())
-    MyDataset = importlib.import_module('datasets.' + c.data_loader)
-    MyDataset = getattr(MyDataset, "MyDataset")
     audio = importlib.import_module('utils.' + c.audio['audio_processor'])
     AudioProcessor = getattr(audio, 'AudioProcessor')

From 9d48430c3e722be6a33b617ea7d159b42d820df8 Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Tue, 18 Dec 2018 18:59:39 +0100
Subject: [PATCH 17/17] TWEB meta file in config

---
 config.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/config.json b/config.json
index b8ce9a90..f173c539 100644
--- a/config.json
+++ b/config.json
@@ -44,8 +44,8 @@
    "run_eval": true,
    "data_path": "../../Data/LJSpeech-1.1/",  // can overwritten from command argument
-    "meta_file_train": "transcript.txt",      // metafile for training dataloader.
-    "meta_file_val": "",                      // metafile for evaluation dataloader.
+    "meta_file_train": "transcript_train.txt",  // metafile for training dataloader.
+    "meta_file_val": "transcript_val.txt",      // metafile for evaluation dataloader.
    "dataset": "tweb",                        // one of TTS.dataset.preprocessors depending on your target dataset. Use "tts_cache" for pre-computed dataset by extract_features.py
    "min_seq_len": 0,                         // minimum text length to use in training
    "max_seq_len": 300,                       // maximum text length