From f7074608868aadd337d46f5e7ab35ff848a58c73 Mon Sep 17 00:00:00 2001
From: erogol
Date: Tue, 18 Feb 2020 12:26:52 +0100
Subject: [PATCH] cleaners update for special chars and config update

---
 config.json                     | 8 ++++----
 notebooks/Benchmark-PWGAN.ipynb | 5 ++++-
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/config.json b/config.json
index 8653d92f..a7ed04a3 100644
--- a/config.json
+++ b/config.json
@@ -39,7 +39,7 @@
     "batch_size": 2,       // Batch size for training. Values lower than 32 might make attention hard to learn. It is overwritten by 'gradual_training'.
     "eval_batch_size": 16,
     "r": 7,                // Number of decoder frames to predict per iteration. Set the initial values if gradual training is enabled.
-    "gradual_training": [[0, 7, 64], [1, 5, 64], [50000, 3, 32], [130000, 2, 32], [290000, 1, 32]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
+    "gradual_training": [[0, 7, 64], [2000, 5, 64], [35000, 3, 32], [70000, 2, 32], [140000, 1, 32]], // set gradual training steps [first_step, r, batch_size]. If it is null, gradual training is disabled. For Tacotron, you might need to reduce the 'batch_size' as you proceed.
     "loss_masking": true,  // enable / disable loss masking against the sequence padding.
     "grad_accum": 2,       // if N > 1, enable gradient accumulation for N iterations. It is useful for low-memory GPUs.
@@ -93,10 +93,10 @@
     "max_seq_len": 150,    // DATASET-RELATED: maximum text length

     // PATHS
-    "output_path": "/data4/rw/home/Trainings/",
+    "output_path": "/home/erogol/Models/",

     // PHONEMES
-    "phoneme_cache_path": "mozilla_us_phonemes",  // phoneme computation is slow; therefore, it caches results in the given folder.
+    "phoneme_cache_path": "mozilla_us_phonemes_2_1",  // phoneme computation is slow; therefore, it caches results in the given folder.
     "use_phonemes": true,  // use phonemes instead of raw characters. It is suggested for better pronunciation.
     "phoneme_language": "en-us",  // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
@@ -110,7 +110,7 @@
     [
       {
         "name": "ljspeech",
-        "path": "/root/LJSpeech-1.1/",
+        "path": "/home/erogol/Data/LJSpeech-1.1/",
         "meta_file_train": "metadata.csv",
         "meta_file_val": null
       }
diff --git a/notebooks/Benchmark-PWGAN.ipynb b/notebooks/Benchmark-PWGAN.ipynb
index 430d329f..f17f71f4 100644
--- a/notebooks/Benchmark-PWGAN.ipynb
+++ b/notebooks/Benchmark-PWGAN.ipynb
@@ -85,7 +85,10 @@
     "    if use_cuda:\n",
     "        waveform = waveform.cpu()\n",
     "    waveform = waveform.numpy()\n",
-    "    print(\" > Run-time: {}\".format(time.time() - t_1))\n",
+    "    rtf = (time.time() - t_1) / (len(waveform) / ap.sample_rate)\n",
+    "    print(waveform.shape)\n",
+    "    print(\" > Run-time: {}\".format(time.time() - t_1))\n",
+    "    print(\" > Real-time factor: {}\".format(rtf))\n",
     "    if figures: \n",
     "        visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, ap._denormalize(mel_spec)) \n",
     "    IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate'], normalize=False)) \n",
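For context on the retimed `gradual_training` schedule in the first hunk: each `[first_step, r, batch_size]` triple takes effect once training passes `first_step`. Below is a minimal sketch of how such a schedule can be resolved at a given global step; the helper name is hypothetical, and the actual lookup lives in the training code, not in this patch.

```python
def resolve_gradual_params(global_step, schedule):
    """Return (r, batch_size) for the last schedule entry whose
    first_step <= global_step. Entries are assumed sorted by first_step."""
    r, batch_size = schedule[0][1], schedule[0][2]
    for first_step, new_r, new_bs in schedule:
        if global_step >= first_step:
            r, batch_size = new_r, new_bs
    return r, batch_size

# New schedule from this patch: r drops 7 -> 1 as training progresses.
schedule = [[0, 7, 64], [2000, 5, 64], [35000, 3, 32], [70000, 2, 32], [140000, 1, 32]]
print(resolve_gradual_params(50000, schedule))  # (3, 32)
```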
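The notebook change computes a real-time factor (RTF): wall-clock synthesis time divided by the duration of the generated audio, so values below 1.0 mean faster-than-real-time synthesis. Here is a self-contained sketch of the same computation; the variable names are illustrative, not from the repo.

```python
import numpy as np

def real_time_factor(synthesis_seconds, waveform, sample_rate):
    """RTF = synthesis time / audio duration; < 1.0 is faster than real time."""
    audio_seconds = len(waveform) / sample_rate
    return synthesis_seconds / audio_seconds

# Example: 2 s of 22050 Hz audio synthesized in 0.5 s -> RTF = 0.25
waveform = np.zeros(2 * 22050, dtype=np.float32)
print(real_time_factor(0.5, waveform, 22050))  # 0.25
```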