From d96690f83f5bc804f532b6c2d5cca208de401e7a Mon Sep 17 00:00:00 2001
From: Eren Golge
Date: Fri, 2 Nov 2018 17:27:31 +0100
Subject: [PATCH] Config updates and add sigmoid to mel network again

---
 config.json                  | 6 ++++--
 datasets/TTSDatasetCached.py | 5 +++--
 layers/tacotron.py           | 1 +
 utils/audio.py               | 6 +++---
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/config.json b/config.json
index 1491601d..b9214573 100644
--- a/config.json
+++ b/config.json
@@ -4,6 +4,7 @@
 
     "audio":{
         "audio_processor": "audio",     // to use dictate different audio processors, if available.
+        // Audio processing parameters
         "num_mels": 80,         // size of the mel spec frame.
         "num_freq": 1025,       // number of stft frequency levels. Size of the linear spectogram frame.
         "sample_rate": 22050,   // wav sample-rate. If different than the original data, it is resampled.
@@ -14,6 +15,7 @@
         "ref_level_db": 20,     // reference level db, theoretically 20db is the sound of air.
         "power": 1.5,           // value to sharpen wav signals after GL algorithm.
         "griffin_lim_iters": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
+        // Normalization parameters
         "signal_norm": true,    // normalize the spec values in range [0, 1]
         "symmetric_norm": false, // move normalization to range [-1, 1]
         "max_norm": 1,          // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
@@ -22,7 +24,7 @@
         "mel_fmax": null        // maximum freq level for mel-spec. Tune for dataset!!
     },
 
-    "embedding_size": 256,
+    "embedding_size": 256,
     "text_cleaner": "english_cleaners",
     "epochs": 1000,
     "lr": 0.0015,
@@ -36,7 +38,7 @@
 
     "print_step": 10,
     "run_eval": true,
-    "data_path": "../../Data/LJSpeech-1.1/tts_cache",  // can overwritten from command argument
+    "data_path": "../../Data/LJSpeech-1.1/",  // can overwritten from command argument
     "meta_file_train": "metadata_train.csv", // metafile for training dataloader
     "meta_file_val": "metadata_val.csv",     // metafile for validation dataloader
     "data_loader": "TTSDataset",             // dataloader, ["TTSDataset", "TTSDatasetCached", "TTSDatasetMemory"]
diff --git a/datasets/TTSDatasetCached.py b/datasets/TTSDatasetCached.py
index 57a58f55..b5c6d4ce 100644
--- a/datasets/TTSDatasetCached.py
+++ b/datasets/TTSDatasetCached.py
@@ -24,6 +24,7 @@ class MyDataset(Dataset):
                  min_seq_len=0,
                  **kwargs
                  ):
+        self.ap = ap
         self.root_path = root_path
         self.batch_group_size = batch_group_size
         self.feat_dir = os.path.join(root_path, 'loader_data')
@@ -38,7 +39,7 @@ class MyDataset(Dataset):
 
     def load_wav(self, filename):
         try:
-            audio = librosa.core.load(filename, sr=self.sample_rate)
+            audio = self.ap.load_wav(filename)
             return audio
         except RuntimeError as e:
             print(" !! Cannot read file : {}".format(filename))
@@ -90,7 +91,7 @@ class MyDataset(Dataset):
         if wav_name.split('.')[-1] == 'npy':
             wav = self.load_np(wav_name)
         else:
-            wav = np.asarray(self.load_wav(wav_name)[0], dtype=np.float32)
+            wav = np.asarray(self.load_wav(wav_name), dtype=np.float32)
         mel = self.load_np(mel_name)
         linear = self.load_np(linear_name)
         sample = {
diff --git a/layers/tacotron.py b/layers/tacotron.py
index 749d7cb3..83d0d28b 100644
--- a/layers/tacotron.py
+++ b/layers/tacotron.py
@@ -416,6 +416,7 @@ class Decoder(nn.Module):
             decoder_output = decoder_input
             # predict mel vectors from decoder vectors
             output = self.proj_to_mel(decoder_output)
+            output = torch.sigmoid(output)
             # predict stop token
             stopnet_input = torch.cat([decoder_input, output], -1)
             stop_token = self.stopnet(stopnet_input)
diff --git a/utils/audio.py b/utils/audio.py
index a82eaeba..8b961099 100644
--- a/utils/audio.py
+++ b/utils/audio.py
@@ -48,11 +48,10 @@ class AudioProcessor(object):
         self.max_norm = 1.0 if max_norm is None else float(max_norm)
         self.clip_norm = clip_norm
         self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
-        if preemphasis == 0:
-            print(" | > Preemphasis is deactive.")
         print(" | > Audio Processor attributes.")
         members = vars(self)
-        pprint(members)
+        for key, value in members.items():
+            print(" | > {}:{}".format(key, value))
 
     def save_wav(self, wav, path):
         wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
@@ -226,6 +225,7 @@ class AudioProcessor(object):
 
     def load_wav(self, filename, encode=False):
         x, sr = librosa.load(filename, sr=self.sample_rate)
+        # sr, x = io.wavfile.read(filename)
         assert self.sample_rate == sr
         return x
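
Note: the torch.sigmoid added in layers/tacotron.py bounds the decoder's mel prediction to (0, 1), which lines up with the [0, 1] spectrogram normalization chosen in config.json ("signal_norm": true, "symmetric_norm": false, "max_norm": 1). The snippet below is a minimal standalone sketch of that pairing, not the repo's Decoder; proj_to_mel and decoder_output are hypothetical stand-ins, 80 matches "num_mels" from the config, and the 256 decoder width is illustrative only.

import torch

# Sketch (hypothetical names): with targets normalized into [0, 1],
# a sigmoid on the mel projection keeps predictions in the targets' range.
proj_to_mel = torch.nn.Linear(256, 80)   # 80 mel channels as in "num_mels"; 256 is illustrative
decoder_output = torch.randn(4, 256)     # fake batch of decoder states
output = torch.sigmoid(proj_to_mel(decoder_output))
print(output.min().item() >= 0.0, output.max().item() <= 1.0)  # both True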