From f7074608868aadd337d46f5e7ab35ff848a58c73 Mon Sep 17 00:00:00 2001
From: erogol
Date: Tue, 18 Feb 2020 12:26:52 +0100
Subject: [PATCH] cleaners update for special chars and config update

---
 config.json                     | 8 ++++----
 notebooks/Benchmark-PWGAN.ipynb | 5 ++++-
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/config.json b/config.json
index 8653d92f..a7ed04a3 100644
--- a/config.json
+++ b/config.json
@@ -39,7 +39,7 @@
     "batch_size": 2,       // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.
     "eval_batch_size": 16,
     "r": 7,                // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
-    "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
+    "gradual_training": [[0, 7, 64], [2000, 5, 64], [35000, 3, 32], [70000, 2, 32], [140000, 1, 32]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
     "loss_masking": true,  // enable / disable loss masking against the sequence padding.
     "grad_accum": 2,       // if N > 1, enable gradient accumulation for N iterations. It is useful for low-memory GPUs.
@@ -93,10 +93,10 @@
     "max_seq_len": 150,    // DATASET-RELATED: maximum text length

     // PATHS
-    "output_path": "/data4/rw/home/Trainings/",
+    "output_path": "/home/erogol/Models/",

     // PHONEMES
-    "phoneme_cache_path": "mozilla_us_phonemes",  // phoneme computation is slow; therefore, it caches results in the given folder.
+    "phoneme_cache_path": "mozilla_us_phonemes_2_1",  // phoneme computation is slow; therefore, it caches results in the given folder.
     "use_phonemes": true,  // use phonemes instead of raw characters. It is suggested for better pronunciation.
     "phoneme_language": "en-us",  // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
@@ -110,7 +110,7 @@
     [
       {
         "name": "ljspeech",
-        "path": "/root/LJSpeech-1.1/",
+        "path": "/home/erogol/Data/LJSpeech-1.1/",
         "meta_file_train": "metadata.csv",
         "meta_file_val": null
       }
diff --git a/notebooks/Benchmark-PWGAN.ipynb b/notebooks/Benchmark-PWGAN.ipynb
index 430d329f..f17f71f4 100644
--- a/notebooks/Benchmark-PWGAN.ipynb
+++ b/notebooks/Benchmark-PWGAN.ipynb
@@ -85,7 +85,10 @@
     "    if use_cuda:\n",
     "        waveform = waveform.cpu()\n",
     "    waveform = waveform.numpy()\n",
-    "    print(\" > Run-time: {}\".format(time.time() - t_1))\n",
+    "    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
+    "    print(waveform.shape)\n",
+    "    print(\" > Run-time: {}\".format(time.time() - t_1))\n",
+    "    print(\" > Real-time factor: {}\".format(rtf))\n",
     "    if figures: \n",
     "        visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, ap._denormalize(mel_spec)) \n",
     "    IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'], normalize=False)) \n",
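For context on the retimed `gradual_training` schedule in the first hunk: each `[first_step, r, batch_size]` triple takes effect once training passes `first_step`. Below is a minimal sketch of how such a schedule can be resolved at a given global step; the helper name is hypothetical, and the actual lookup lives in the training code, not in this patch.

```python
def resolve_gradual_params(global_step, schedule):
    """Return (r, batch_size) for the last schedule entry whose
    first_step <= global_step. Entries are assumed sorted by first_step."""
    r, batch_size = schedule[0][1], schedule[0][2]
    for first_step, new_r, new_bs in schedule:
        if global_step >= first_step:
            r, batch_size = new_r, new_bs
    return r, batch_size

# New schedule from this patch: r drops 7 -> 1 as training progresses.
schedule = [[0, 7, 64], [2000, 5, 64], [35000, 3, 32], [70000, 2, 32], [140000, 1, 32]]
print(resolve_gradual_params(50000, schedule))  # (3, 32)
```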
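The notebook change computes a real-time factor (RTF): wall-clock synthesis time divided by the duration of the generated audio, so values below 1.0 mean faster-than-real-time synthesis. Here is a self-contained sketch of the same computation; the variable names are illustrative, not from the repo.

```python
import numpy as np

def real_time_factor(synthesis_seconds, waveform, sample_rate):
    """RTF = synthesis time / audio duration; < 1.0 is faster than real time."""
    audio_seconds = len(waveform) / sample_rate
    return synthesis_seconds / audio_seconds

# Example: 2 s of 22050 Hz audio synthesized in 0.5 s -> RTF = 0.25
waveform = np.zeros(2 * 22050, dtype=np.float32)
print(real_time_factor(0.5, waveform, 22050))  # 0.25
```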