data-maker/data/maker/__init__.py


"""
(c) 2019 Data Maker, hiplab.mc.vanderbilt.edu
version 1.0.0
This package serves as a proxy to the overall usage of the framework.
This package is designed to generate synthetic data from an original dataset using deep learning techniques
@TODO:
- Make configurable GPU, EPOCHS
"""
import pandas as pd
import numpy as np
import data.gan as gan
from transport import factory
from data.bridge import Binary
import threading as thread
from data.maker import prepare
import copy
import os
import json
class ContinuousToDiscrete :
ROUND_UP = 2
    @staticmethod
    def binary(X,n=4) :
        """
        This function will convert a continuous stream of values into a binary (one-hot) matrix of n bins per value
        """
        values = np.array(X).astype(np.float32)
        BOUNDS = ContinuousToDiscrete.bounds(values,n)
        matrix = np.repeat(np.zeros(n),len(X)).reshape(len(X),n)
        #
        # Flag the bin each value falls into ; a row stays at zero if rounding pushed the value outside every bin
        #
        for i in np.arange(len(values)) :
            for j in np.arange(n) :
                if values[i] >= BOUNDS[j].left and values[i] <= BOUNDS[j].right :
                    matrix[i][j] = 1
                    break
        return matrix
    @staticmethod
    def bounds(x,n):
        """
        This function will compute the boundaries (pandas intervals) of n bins over a continuous column
        """
        values = np.round(x,ContinuousToDiscrete.ROUND_UP)
        return list(pd.cut(values,n).categories)
    @staticmethod
    def continuous(X,BIN_SIZE=4) :
        """
        This function will approximate a stream of continuous values by binning them and resampling each value
        uniformly within the bin it falls into
        :X          list/array of continuous values
        :BIN_SIZE   number of bins used for the approximation
        """
        BOUNDS = ContinuousToDiscrete.bounds(X,BIN_SIZE)
        values = []
        for i in np.arange(len(X)):
            value = X[i]
            for item in BOUNDS :
                if value >= item.left and value <= item.right :
                    values += [np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP)]
                    break
            else:
                #
                # The value falls outside every bin (rounding edge case) ; keep it so the output stays aligned with the input
                #
                values += [np.round(value,ContinuousToDiscrete.ROUND_UP)]
        return values
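
    #
    # Illustrative example (made-up values) : given ages = [21.0, 35.5, 62.0]
    #   ContinuousToDiscrete.binary(ages,n=4)       returns a 3 x 4 one-hot matrix (one bin flagged per value)
    #   ContinuousToDiscrete.continuous(ages,4)     returns 3 values, each resampled uniformly within its bin
    #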
def train (**_args):
"""
:params sql
:params store
"""
#
# Let us prepare the data by calling the utility function
#
    #
    # The data is expected to be provided as a data-frame (read upstream from a file or a data-store)
    #
_inputhandler = prepare.Input(**_args)
values,_matrix = _inputhandler.convert()
args = {"real":_matrix,"context":_args['context']}
_map = {}
if 'store' in _args :
        #
        # A store configuration was provided ; instantiate a logger (transport factory) to keep track of this run.
        # Note that _args['store']['logs'] must contain an 'args' dictionary, since its 'doc' entry is overwritten with the context below.
        #
args['store'] = copy.deepcopy(_args['store']['logs'])
args['store']['args']['doc'] = _args['context']
logger = factory.instance(**args['store'])
args['logger'] = logger
for key in _inputhandler._map :
beg = _inputhandler._map[key]['beg']
end = _inputhandler._map[key]['end']
values = _inputhandler._map[key]['values'].tolist()
_map[key] = {"beg":beg,"end":end,"values":np.array(values).astype(str).tolist()}
info = {"rows":_matrix.shape[0],"cols":_matrix.shape[1],"map":_map}
logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":_inputhandler._io})
args['logs'] = _args['logs'] if 'logs' in _args else 'logs'
    args['max_epochs'] = _args['max_epochs']
args['matrix_size'] = _matrix.shape[0]
args['batch_size'] = 2000
args['partition'] = 0 if 'partition' not in _args else _args['partition']
os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0'
trainer = gan.Train(**args)
    #
    # Persist the column map (map.json) in the output directory so that generate() can later revert the encoding
    #
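    #
    # For every column, map.json keeps the beg/end markers and the observed values (serialized as strings)
    # exactly as assembled in _map above ; generate() reloads this file to revert the encoding
    #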
f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']),'w')
f.write(json.dumps(_map))
f.close()
trainer.apply()
pass
def _train (**args) :
"""
This function is intended to train the GAN in order to learn about the distribution of the features
:column columns that need to be synthesized (discrete)
    :logs where the output (learnt model) is stored on disk
:id identifier of the dataset
:data data-frame to be synthesized
:context label of what we are synthesizing
"""
column = args['column'] if (isinstance(args['column'],list)) else [args['column']]
# CONTINUOUS = args['continuous'] if 'continuous' in args else []
# column_id = args['id']
df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
df.columns = [name.lower() for name in df.columns]
#
# @TODO:
# Consider sequential training of sub population for extremely large datasets
#
#
# If we have several columns we will proceed one at a time (it could be done in separate threads)
    # @TODO : Consider performing this task on several threads/GPUs simultaneously
#
for col in column :
msize = args['matrix_size'] if 'matrix_size' in args else -1
args['real'] = (Binary()).apply(df[col],msize)
context = args['context']
if 'store' in args :
args['store']['args']['doc'] = context
logger = factory.instance(**args['store'])
args['logger'] = logger
info = {"rows":args['real'].shape[0],"cols":args['real'].shape[1],"name":col,"partition":args['partition']}
logger.write({"module":"gan-train","action":"data-prep","input":info})
else:
logger = None
args['column'] = col
args['context'] = col
trainer = gan.Train(**args)
trainer.apply()
def get(**args):
"""
    This function will restore a checkpoint from a persistent store onto disk
"""
pass
def generate(**_args):
"""
This function will generate a set of records, before we must load the parameters needed
:param data
:param context
:param logs
"""
f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']))
_map = json.loads(f.read())
f.close()
# if 'file' in _args :
# df = pd.read_csv(_args['file'])
# else:
# df = _args['data'] if not isinstance(_args['data'],str) else pd.read_csv(_args['data'])
args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']}
args['logs'] = _args['logs'] if 'logs' in _args else 'logs'
# args['matrix_size'] = _matrix.shape[0]
args['batch_size'] = 2000
args['partition'] = 0 if 'partition' not in _args else _args['partition']
args['row_count'] = _args['data'].shape[0]
#
# @TODO: perhaps get the space of values here ... (not sure it's a good idea)
#
_args['map'] = _map
_inputhandler = prepare.Input(**_args)
values,_matrix = _inputhandler.convert()
args['values'] = np.array(values)
if 'gpu' in _args :
os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu'])
handler = gan.Predict (**args)
handler.load_meta(None)
#
# Let us now format the matrices by reverting them to a data-frame with values
#
candidates = handler.apply(candidates=args['candidates'])
return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates]
def _generate(**args):
"""
This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset
@return pandas.DataFrame
:data data-frame to be synthesized
:column columns that need to be synthesized (discrete)
:id column identifying an entity
    :logs location on disk where the learnt knowledge of the dataset is stored
"""
# df = args['data']
df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
CONTINUOUS = args['continuous'] if 'continuous' in args else []
column = args['column'] if (isinstance(args['column'],list)) else [args['column']]
# column_id = args['id']
#
#@TODO:
    # If the identifier is not present, we should find a way to determine or make one
#
BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size'])
# NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value']
bhandler = Binary()
_df = df.copy()
for col in column :
args['context'] = col
args['column'] = col
msize = args['matrix_size'] if 'matrix_size' in args else -1
values = bhandler.get_column(df[col],msize)
MISSING= bhandler.get_missing(df[col],msize)
args['values'] = values
args['row_count'] = df.shape[0]
# if col in NO_VALUE :
# args['no_value'] = NO_VALUE[col]
# else:
# args['no_value'] = NO_VALUE
# novalue = NO_VALUE[col] if NO_VALUE[col] in ['na',''] else NO_VALUE[col]
# MISSING += [NO_VALUE[col]]
args['missing'] = MISSING
#
# we can determine the cardinalities here so we know what to allow or disallow
handler = gan.Predict (**args)
handler.load_meta(col)
r = handler.apply()
if col in CONTINUOUS :
r[col] = np.array(r[col])
_approx = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) #-- approximating based on arbitrary bins
r[col] = _approx
_df[col] = r[col]
#
# Let's cast the type to the original type (it makes the data more usable)
#
otype = df[col].dtype
_df[col] = _df[col].astype(otype)
#
# @TODO: log basic stats about the synthetic attribute
#
# break
return _df