From 4587c72a03a93c402cb8b735612159b37ba0ee0d Mon Sep 17 00:00:00 2001
From: Eren Golge <egolge@mozilla.com>
Date: Mon, 17 Dec 2018 16:33:29 +0100
Subject: [PATCH] Add preprocessor for TWEB dataset

---
 datasets/preprocess.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/datasets/preprocess.py b/datasets/preprocess.py
index 89d464a0..67c184ef 100644
--- a/datasets/preprocess.py
+++ b/datasets/preprocess.py
@@ -13,10 +13,20 @@ def tts_cache(root_path, meta_file):
     return items            
 
 
-# def tweb(root_path, meta_file):
-#     # TODO
-#     pass
-#     return 
+def tweb(root_path, meta_file):
+    """Read TWEB tab-separated `id<TAB>text` rows into shuffled [text, wav_path] items.
+    https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset
+    """
+    txt_file = os.path.join(root_path, meta_file)
+    items = []
+    with open(txt_file, 'r') as ttf:
+        for line in ttf:
+            if not line.strip():
+                continue  # tolerate blank lines (e.g. a trailing one) instead of raising IndexError
+            cols = line.rstrip('\r\n').split('\t')  # drop the EOL so it does not leak into the text
+            items.append([cols[1], os.path.join(root_path, cols[0]+'.wav')])
+    random.shuffle(items)
+    return items
     
 
 # def kusal(root_path, meta_file):