diff --git a/datasets/preprocess.py b/datasets/preprocess.py
index 89d464a0..67c184ef 100644
--- a/datasets/preprocess.py
+++ b/datasets/preprocess.py
@@ -13,10 +13,37 @@ def tts_cache(root_path, meta_file):
     return items
 
 
-# def tweb(root_path, meta_file):
-#     # TODO
-#     pass
-#     return
+def tweb(root_path, meta_file):
+    """Normalize the TWEB dataset metadata into ``[text, wav_file]`` items.
+
+    https://www.kaggle.com/bryanpark/the-world-english-bible-speech-dataset
+
+    Each non-blank line of the metadata file is tab-separated: the first
+    column is the wav path relative to ``root_path`` without the ``.wav``
+    extension, the second column is the transcript. Any further columns
+    are ignored.
+
+    Args:
+        root_path: Dataset root directory.
+        meta_file: Metadata file name, relative to ``root_path``.
+
+    Returns:
+        A shuffled list of ``[text, wav_file]`` pairs.
+    """
+    txt_file = os.path.join(root_path, meta_file)
+    items = []
+    with open(txt_file, 'r', encoding='utf-8') as ttf:
+        for line in ttf:
+            # A blank line (e.g. a trailing newline at EOF) has no columns.
+            if not line.strip():
+                continue
+            cols = line.split('\t')
+            wav_file = os.path.join(root_path, cols[0] + '.wav')
+            # Strip so a two-column line does not keep its line terminator.
+            text = cols[1].strip()
+            items.append([text, wav_file])
+    random.shuffle(items)
+    return items
 
 
 # def kusal(root_path, meta_file):