data-maker/data/maker/__init__.py

190 lines
6.4 KiB
Python

"""
(c) 2019 Data Maker, hiplab.mc.vanderbilt.edu
version 1.0.0
This package serves as a proxy to the overall usage of the framework.
It is designed to generate synthetic data from an original dataset using deep learning techniques
@TODO:
- Make configurable GPU, EPOCHS
"""
import pandas as pd
import numpy as np
import data.gan as gan
from transport import factory
from data.bridge import Binary
import threading as thread
from data.maker import prepare
import copy
import os
import json
class ContinuousToDiscrete :
    """
    Helpers that map a 1-D stream of continuous values onto equal-width bins and back.
    The bins are computed with pandas.cut over the values rounded to ROUND_UP decimals.
    """
    # Number of decimals kept when rounding the inputs and the generated values
    ROUND_UP = 2

    @staticmethod
    def _locate(value,BOUNDS) :
        """
        This function returns the index of the first interval of BOUNDS that holds value, or None
        Both edges are accepted on purpose: pandas rounds the interval edges, so the smallest
        value may sit exactly on the (open) left edge of the first interval
        """
        for index,item in enumerate(BOUNDS) :
            if value >= item.left and value <= item.right :
                return index
        return None

    @staticmethod
    def binary(X,n=4) :
        """
        This function will convert a continuous stream of information into a bit stream of bins
        :X  1-D sequence of numeric values
        :n  number of bins
        :return matrix of shape (len(X),n), row i holds a single 1 in the column of the bin of X[i]
                (a value that falls in no bin e.g NaN yields a row of zeros)
        """
        values = np.asarray(X,dtype=float)
        matrix = np.zeros((len(values),n))
        if len(values) == 0 :
            # pandas.cut can not bin an empty array, there is nothing to encode anyway
            return matrix
        BOUNDS = ContinuousToDiscrete.bounds(values,n)
        #
        # The bins are computed on rounded values, the lookup must use the same rounded values
        rounded = np.round(values,ContinuousToDiscrete.ROUND_UP)
        for row,value in enumerate(rounded) :
            index = ContinuousToDiscrete._locate(value,BOUNDS)
            if index is not None :
                matrix[row,index] = 1
        return matrix

    @staticmethod
    def bounds(x,n):
        """
        This function returns the n equal-width intervals (pandas.Interval) that cover the rounded values of x
        :x  1-D sequence of numeric values (must not be empty)
        :n  number of bins
        """
        values = np.round(x,ContinuousToDiscrete.ROUND_UP)
        return list(pd.cut(values,n).categories)

    @staticmethod
    def continuous(X,BIN_SIZE=4) :
        """
        This function will approximate a continuous stream given boundary information:
        every value is replaced by a random draw (uniform) from the bin it belongs to
        :X          1-D sequence of numeric values
        :BIN_SIZE   number of bins
        :return list of values rounded to ROUND_UP decimals, one per input value that falls in a bin
        """
        data = np.asarray(X,dtype=float)
        if len(data) == 0 :
            return []
        BOUNDS = ContinuousToDiscrete.bounds(data,BIN_SIZE)
        #
        # The lookup is done on the rounded values (the ones the bins were built from), otherwise
        # a value like 1.004 (rounded to 1.0) would sit outside of every bin and be silently dropped
        rounded = np.round(data,ContinuousToDiscrete.ROUND_UP)
        values = []
        for value in rounded :
            index = ContinuousToDiscrete._locate(value,BOUNDS)
            if index is None :
                continue
            item = BOUNDS[index]
            values += [np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP)]
        return values
def train (**_args):
    """
    This function prepares the input data, trains the GAN on it and writes the column map (map.json)
    in the output folder of the trainer.

    :params context     name of the context, handed to the trainer and used as the 'doc' of the log store
    :params max_epochs  number of epochs handed to the trainer (required)
    :params store       (optional) storage configuration, store['logs'] is used to create the logger
    :params logs        (optional) log folder, defaults to 'logs'
    :params partition   (optional) partition handed to the trainer
    :params gpu         (optional) gpu handed to the trainer
    All the arguments are also forwarded as-is to prepare.Input (e.g sql)
    """
    _inputhandler = prepare.Input(**_args)
    _,_matrix = _inputhandler.convert()
    args = {"real":_matrix,"context":_args['context']}
    _map = {}
    if 'store' in _args :
        #
        # The logger works on a copy of the log store so the configuration of the caller is left untouched
        args['store'] = copy.deepcopy(_args['store']['logs'])
        args['store']['args']['doc'] = _args['context']
        logger = factory.instance(**args['store'])
        args['logger'] = logger
        #
        # NOTE(review): the source lost its indentation; the map is assumed to be built within this
        # branch (like the logger), so map.json is empty when no store is provided -- please confirm
        for key in _inputhandler._map :
            entry = _inputhandler._map[key]
            values = entry['values'].tolist()
            # values are stored as strings so the map can be serialized to json
            _map[key] = {"beg":entry['beg'],"end":entry['end'],"values":np.array(values).astype(str).tolist()}
        logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":_inputhandler._io})
    args['logs'] = _args['logs'] if 'logs' in _args else 'logs'
    args ['max_epochs'] = _args['max_epochs']
    args['matrix_size'] = _matrix.shape[0]
    args['batch_size'] = 2000
    if 'partition' in _args :
        args['partition'] = _args['partition']
    if 'gpu' in _args :
        args['gpu'] = _args['gpu']
    trainer = gan.Train(**args)
    #
    # The map is written in the output folder of the trainer (before training starts);
    # the file is closed even if the write fails
    with open(os.sep.join([trainer.out_dir,'map.json']),'w') as f :
        f.write(json.dumps(_map))
    trainer.apply()
def get(**args):
    """
    Placeholder: this function is meant to restore a checkpoint from a persistent storage on to disk
    It is not implemented yet, any keyword argument is accepted and nothing is returned
    """
    return None
def generate(**_args):
    """
    This function will generate a set of records, before we must load the parameters needed (map.json)

    :param data         original data, only its number of rows (data.shape[0]) is read here
    :param context      name of the context, used to locate the output folder
    :param logs         (optional) log folder, defaults to 'logs'
    :param max_epochs   number of epochs handed to the predictor (required)
    :param candidates   handed to the predictor and to its apply function (required)
    :param partition    (optional) partition, a falsy value (None, 0) means the map is read from the context folder
    :param gpu          (optional) gpu handed to the predictor
    All the arguments (and the loaded map) are also forwarded to prepare.Input
    :return list with one reverted matrix (_inputhandler.revert) per candidate
    """
    partition = _args['partition'] if 'partition' in _args else None
    # The same default is used to locate the map and to configure the predictor
    logs = _args['logs'] if 'logs' in _args else 'logs'
    #
    # The map lives in <logs>/output/<context>[/<partition>]/map.json
    MAP_FOLDER = [logs,'output',_args['context']]
    if partition :
        MAP_FOLDER.append(str(partition))
    with open(os.sep.join(MAP_FOLDER + ['map.json'])) as f :
        _map = json.loads(f.read())

    args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']}
    args['logs'] = logs
    args['batch_size'] = 2000
    args['partition'] = 0 if 'partition' not in _args else _args['partition']
    args['row_count'] = _args['data'].shape[0]
    #
    # @TODO: perhaps get the space of values here ... (not sure it's a good idea)
    #
    _args['map'] = _map
    _inputhandler = prepare.Input(**_args)
    values,_matrix = _inputhandler.convert()
    args['values'] = np.array(values)
    if 'gpu' in _args :
        args['gpu'] = _args['gpu']
    handler = gan.Predict (**args)
    lparams = {'columns':None}
    if partition :
        lparams['partition'] = partition
    handler.load_meta(**lparams)
    #
    # Let us now format the matrices by reverting them to a data-frame with values
    #
    candidates = handler.apply(candidates=args['candidates'])
    return [_inputhandler.revert(matrix=_candidate) for _candidate in candidates]