From 0f0bde935ceec25f93117ff6d507f19162101041 Mon Sep 17 00:00:00 2001
From: Eren Golge <egolge@mozilla.com>
Date: Fri, 23 Nov 2018 16:58:26 +0100
Subject: [PATCH] trim silence if enabled

---
 config.json    |  3 ++-
 utils/audio.py | 11 +++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/config.json b/config.json
index e166a750..9d66b45d 100644
--- a/config.json
+++ b/config.json
@@ -21,7 +21,8 @@
         "max_norm": 1,          // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
         "clip_norm": true,      // clip normalized values into the range.
         "mel_fmin": null,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
-        "mel_fmax": null        // maximum freq level for mel-spec. Tune for dataset!!
+        "mel_fmax": null,        // maximum freq level for mel-spec. Tune for dataset!!
+        "do_trim_silence": true  // enable trimming of silence from audio as it is loaded.
     },
 
     "embedding_size": 256,    
diff --git a/utils/audio.py b/utils/audio.py
index 8b961099..da54d6c8 100644
--- a/utils/audio.py
+++ b/utils/audio.py
@@ -26,6 +26,7 @@ class AudioProcessor(object):
                  mel_fmax=None,
                  clip_norm=True,
                  griffin_lim_iters=None,
+                 do_trim_silence=False,
                  **kwargs):
 
         print(" > Setting up Audio Processor...")
@@ -47,6 +48,7 @@ class AudioProcessor(object):
         self.mel_fmax = mel_fmax
         self.max_norm = 1.0 if max_norm is None else float(max_norm)
         self.clip_norm = clip_norm
+        self.do_trim_silence = do_trim_silence
         self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
         print(" | > Audio Processor attributes.")
         members = vars(self)
@@ -203,6 +205,13 @@ class AudioProcessor(object):
                 return x + hop_length
         return len(wav)
 
+    def trim_silence(self, wav):
+        """ Drop 0.1 sec at both ends, then trim edge silence (top_db=40) """
+        edge = int(self.sample_rate * 0.1)
+        trimmed, _ = librosa.effects.trim(
+            wav[edge:-edge], top_db=40, frame_length=1024, hop_length=256)
+        return trimmed
+
     # WaveRNN repo specific functions
     # def mulaw_encode(self, wav, qc):
     #     mu = qc - 1
@@ -225,6 +234,8 @@ class AudioProcessor(object):
 
     def load_wav(self, filename, encode=False):
         x, sr = librosa.load(filename, sr=self.sample_rate)
+        if self.do_trim_silence:  # optional; strips silence via trim_silence()
+            x = self.trim_silence(x)
         # sr, x = io.wavfile.read(filename)
         assert self.sample_rate == sr
         return x