def tweb(root_path, meta_file):
    """Normalize TWEB dataset.

    https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset

    Reads the tab-separated metadata file ``meta_file`` located under
    ``root_path``. The first column of each line is a wav path relative to
    ``root_path`` without the ``.wav`` extension; the second column is the
    transcript.

    Args:
        root_path: Directory containing the metadata file and the audio.
        meta_file: Name of the metadata file, relative to ``root_path``.

    Returns:
        A list of ``[text, wav_file]`` pairs, shuffled with the module-level
        ``random`` generator.

    Raises:
        IndexError: If a non-blank line has no tab-separated second column.
    """
    txt_file = os.path.join(root_path, meta_file)
    items = []
    # Pin the encoding so parsing does not depend on the platform locale.
    with open(txt_file, 'r', encoding='utf-8') as ttf:
        for line in ttf:
            # Drop the line terminator; otherwise it ends up inside the
            # transcript of every item.
            line = line.rstrip('\r\n')
            if not line.strip():
                # Blank lines (e.g. a trailing empty line) carry no sample.
                continue
            cols = line.split('\t')
            wav_file = os.path.join(root_path, cols[0] + '.wav')
            text = cols[1]
            items.append([text, wav_file])
    random.shuffle(items)
    return items