data-maker/data/maker/__init__.py

189 lines
6.4 KiB
Python
Raw Normal View History

"""
(c) 2019 Data Maker, hiplab.mc.vanderbilt.edu
version 1.0.0
This package serves as a proxy to the overall usage of the framework.
This package is designed to generate synthetic data from an original dataset using deep learning techniques.
@TODO:
- Make configurable GPU, EPOCHS
"""
import pandas as pd
import numpy as np
2020-01-05 05:02:15 +00:00
import data.gan as gan
2020-01-04 03:47:05 +00:00
from transport import factory
2020-02-18 08:59:39 +00:00
from data.bridge import Binary
2020-02-11 18:00:16 +00:00
import threading as thread
2021-03-29 16:10:57 +00:00
from data.maker import prepare
import copy
import os
import json
2020-02-29 03:37:26 +00:00
class ContinuousToDiscrete:
    """
    Helpers that map a 1-D series of continuous values onto equal-width bins and back.

    Bin boundaries are computed by ``pandas.cut`` on the values rounded to
    ``ROUND_UP`` decimal places.
    """
    # Number of decimal places kept when rounding inputs and sampled outputs
    ROUND_UP = 2

    @staticmethod
    def binary(X, n=4):
        """
        This function will convert a continuous stream of information into a one-hot matrix of bins
        :X  1-D sequence of numeric values
        :n  number of equal-width bins
        :return matrix of shape (len(X), n); row i holds a single 1 in the column of the
                bin that contains X[i]. Rows for NaN values are left all-zero.
        """
        values = np.round(np.array(X).astype(np.float32), ContinuousToDiscrete.ROUND_UP)
        matrix = np.zeros((len(values), n))
        if len(values) == 0:
            # pandas.cut refuses empty input, there is nothing to encode anyway
            return matrix
        #
        # labels=False yields the bin index of every value (NaN when the value is missing)
        # the cast to float makes the NaN test valid whether or not pandas returned integers
        codes = np.asarray(pd.cut(values, n, labels=False), dtype=float)
        rows = np.flatnonzero(~np.isnan(codes))
        matrix[rows, codes[rows].astype(int)] = 1
        return matrix

    @staticmethod
    def bounds(x, n):
        """
        This function will compute the boundaries of n equal-width bins over the rounded values
        :x  1-D sequence of numeric values
        :n  number of bins
        :return list of pandas Interval objects (the categories produced by pandas.cut)
        """
        values = np.round(x, ContinuousToDiscrete.ROUND_UP)
        return list(pd.cut(values, n).categories)

    @staticmethod
    def continuous(X, BIN_SIZE=4):
        """
        This function will approximate each value by a random draw from the bin it falls in
        :X          1-D sequence of numeric values
        :BIN_SIZE   number of equal-width bins
        :return list with one rounded sample per value that falls in a bin
                (NaN values fall in no bin and are skipped)
        """
        if len(X) == 0:
            # pandas.cut raises on empty input
            return []
        BOUNDS = ContinuousToDiscrete.bounds(X, BIN_SIZE)
        values = []
        #
        # The boundaries are derived from the rounded values, so the rounded value is what must
        # be compared against them. Otherwise a value slightly above the rounded maximum
        # (e.g 1.004 when the maximum rounds to 1.0) matches no bin and is silently dropped
        for value in np.round(np.asarray(X), ContinuousToDiscrete.ROUND_UP):
            for item in BOUNDS:
                if item.left <= value <= item.right:
                    sample = np.random.uniform(item.left, item.right)
                    values.append(np.round(sample, ContinuousToDiscrete.ROUND_UP))
                    break
        return values
2021-03-29 16:10:57 +00:00
def train(**_args):
    """
    This function will train a GAN on the input data and write the column map (map.json)
    in the output folder of the trainer.

    :params context     (required) name of the context, also used as the 'doc' of the log store
    :params max_epochs  (required) forwarded to gan.Train
    :params store       (optional) store configuration, its 'logs' entry is handed to transport.factory to create a logger
    :params logs        (optional) log folder forwarded to gan.Train, defaults to 'logs'
    :params partition   (optional) forwarded to gan.Train
    :params gpu         (optional) forwarded to gan.Train

    All keyword arguments are also handed to prepare.Input, which may consume additional ones.
    """
    _inputhandler = prepare.Input(**_args)
    _, _matrix = _inputhandler.convert()
    args = {"real": _matrix, "context": _args['context']}

    logger = None
    if 'store' in _args:
        #
        # The caller's store configuration is copied because the 'doc' entry is overwritten
        args['store'] = copy.deepcopy(_args['store']['logs'])
        args['store']['args']['doc'] = _args['context']
        logger = factory.instance(**args['store'])
        args['logger'] = logger
    #
    # The map holds for every key of the input handler the begin/end positions and the values
    # (as strings). It does not depend on the logger, so it is built whether or not a store is given
    _map = {}
    for key in _inputhandler._map:
        item = _inputhandler._map[key]
        _map[key] = {
            "beg": item['beg'],
            "end": item['end'],
            "values": np.array(item['values'].tolist()).astype(str).tolist(),
        }
    if logger is not None:
        # There is only a logger when a store was provided
        logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":_inputhandler._io})

    args['logs'] = _args['logs'] if 'logs' in _args else 'logs'
    args['max_epochs'] = _args['max_epochs']
    args['matrix_size'] = _matrix.shape[0]
    args['batch_size'] = 2000
    if 'partition' in _args:
        args['partition'] = _args['partition']
    if 'gpu' in _args:
        args['gpu'] = _args['gpu']

    trainer = gan.Train(**args)
    #
    # The map is written next to the trainer's output, before the training is applied.
    # The context manager closes the file even if the write fails
    with open(os.sep.join([trainer.out_dir, 'map.json']), 'w') as f:
        f.write(json.dumps(_map))
    trainer.apply()
2020-03-08 13:48:38 +00:00
def get(**args):
    """
    Placeholder for restoring a checkpoint from persistent storage onto disk.

    Not implemented: any keyword arguments are accepted and ignored, nothing is returned.
    """
    return None
2021-03-29 16:10:57 +00:00
def generate(**_args):
    """
    This function will generate a set of records, before we must load the parameters needed
    (the map.json file written at training time).

    :param data         (required) input data, its shape[0] is used as the row count
    :param context      (required) name of the context, part of the path of map.json
    :param max_epochs   (required) forwarded to gan.Predict
    :param candidates   (required) forwarded to gan.Predict and to its apply function
    :param logs         (optional) log folder, defaults to 'logs'
    :param partition    (optional) partition, appended to the path of map.json when provided
    :param gpu          (optional) forwarded to gan.Predict
    :return list with the result of the input handler's revert function for every candidate matrix
    """
    partition = _args['partition'] if 'partition' in _args else None
    #
    # NOTE(review): a partition of 0 is falsy and is handled like "no partition" here
    # (same as the original test). Confirm this matches the folder layout used by gan.Train
    logs_root = _args['logs'] if 'logs' in _args else 'logs'
    folders = [logs_root, 'output', _args['context']]
    if partition:
        folders.append(str(partition))
    LOG_DIR = os.sep.join(folders)
    with open(os.sep.join([LOG_DIR, 'map.json'])) as f:
        _map = json.loads(f.read())

    args = {"context": _args['context'], "max_epochs": _args['max_epochs'], "candidates": _args['candidates']}
    args['logs'] = LOG_DIR if 'logs' in _args else 'logs'
    args['batch_size'] = 2000
    args['partition'] = 0 if 'partition' not in _args else _args['partition']
    args['row_count'] = _args['data'].shape[0]
    #
    # @TODO: perhaps get the space of values here ... (not sure it's a good idea)
    #
    _args['map'] = _map
    _inputhandler = prepare.Input(**_args)
    values, _ = _inputhandler.convert()
    args['values'] = np.array(values)
    if 'gpu' in _args:
        args['gpu'] = _args['gpu']

    handler = gan.Predict(**args)
    lparams = {'columns': None}
    if partition:
        lparams['partition'] = partition
    handler.load_meta(**lparams)
    #
    # Let us now format the matrices by reverting them with the input handler
    #
    candidates = handler.apply(candidates=args['candidates'])
    return [_inputhandler.revert(matrix=candidate) for candidate in candidates]