"""
(c) 2019 Data Maker, hiplab.mc.vanderbilt.edu
version 1.0.0

This package serves as a proxy to the overall usage of the framework.
It is designed to generate synthetic data from an original dataset using deep learning techniques.

@TODO:
    - Make GPU and EPOCHS configurable
"""
import pandas as pd
import numpy as np
import data.gan as gan
from transport import factory
from data.bridge import Binary
import threading as thread
from data.maker import prepare
import copy
import os
import json
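
#
# Illustrative end-to-end usage (a minimal sketch, kept commented out so that importing
# this module stays side-effect free; the file name, context label and epoch count are
# hypothetical values chosen by the caller, not defaults of the package):
#
#   import pandas as pd
#   import data.maker as maker
#
#   df = pd.read_csv('patients.csv')
#   maker.train(data=df, context='patients', logs='logs', max_epochs=10)
#   candidates = maker.generate(data=df, context='patients', logs='logs',
#                               max_epochs=10, candidates=1)
#   synthetic_df = candidates[0]              # a pandas.DataFrame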

class ContinuousToDiscrete :
    ROUND_UP = 2

    @staticmethod
    def binary(X,n=4) :
        """
        This function will convert a continuous stream of values into a binary (one-hot) matrix that flags, for each value, the bin it falls into
        """
        values = np.array(X).astype(np.float32)
        BOUNDS = ContinuousToDiscrete.bounds(values,n)
        matrix = np.repeat(np.zeros(n),len(X)).reshape(len(X),n)
        #
        # NOTE: assumed completion -- mark the bin each value falls into (one-hot encoding)
        #
        for i in np.arange(len(values)) :
            for j,item in enumerate(BOUNDS) :
                if values[i] >= item.left and values[i] <= item.right :
                    matrix[i][j] = 1
                    break
        return matrix

    @staticmethod
    def bounds(x,n):
        # return np.array_split(x,n)
        values = np.round(x,ContinuousToDiscrete.ROUND_UP)
        return list(pd.cut(values,n).categories)

    @staticmethod
    def continuous(X,BIN_SIZE=4) :
        """
        This function will approximate continuous values from binned boundary information
        :X          values to be approximated
        :BIN_SIZE   number of bins used to compute the boundaries
        """
        BOUNDS = ContinuousToDiscrete.bounds(X,BIN_SIZE)
        values = []
        for i in np.arange(len(X)) :
            value = X[i]
            #
            # Draw a random value within the boundaries of the bin the original value falls into
            #
            for item in BOUNDS :
                if value >= item.left and value <= item.right :
                    values += [np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP)]
                    break
        return values
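
# Illustrative use of the discretization helpers above (the sample values are made up):
#
#   ages   = [23.0, 37.5, 61.2, 44.9]
#   onehot = ContinuousToDiscrete.binary(ages, n=4)      # (4,4) matrix flagging each value's bin
#   approx = ContinuousToDiscrete.continuous(ages, 4)    # values re-drawn within their bins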

def train (**_args):
    """
    This function trains the GAN on the data that is provided and persists what it has learnt
    :param file        path to a csv file to be synthesized (optional, otherwise :data is used)
    :param data        data-frame to be synthesized
    :param context     label of what we are synthesizing
    :param store       data-store configuration used for logging
    :param logs        location on disk where logs and the learnt model are written (default 'logs')
    :param max_epochs  number of training epochs
    :param gpu         index of the GPU to use (optional, default 0)
    """
    #
    # Let us prepare the data by calling the utility function
    #
    if 'file' in _args :
        #
        # We are reading data from a file
        _args['data'] = pd.read_csv(_args['file'])
    else:
        #
        # data will be read from elsewhere (a data-store)...
        pass
    # if 'ignore' in _args and 'columns' in _args['ignore'] :

    _inputhandler = prepare.Input(**_args)
    values,_matrix = _inputhandler.convert()
    args = {"real":_matrix,"context":_args['context']}
    _map = {}
    if 'store' in _args :
        #
        # Use the logging portion of the data-store to record data-preparation metadata
        #
        args['store'] = copy.deepcopy(_args['store']['logs'])
        args['store']['args']['doc'] = _args['context']
        logger = factory.instance(**args['store'])
        args['logger'] = logger

        for key in _inputhandler._map :
            beg = _inputhandler._map[key]['beg']
            end = _inputhandler._map[key]['end']
            values = _inputhandler._map[key]['values'].tolist()
            _map[key] = {"beg":beg,"end":end,"values":np.array(values).astype(str).tolist()}
        info = {"rows":_matrix.shape[0],"cols":_matrix.shape[1],"map":_map}
        logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":info})

    args['logs'] = _args['logs'] if 'logs' in _args else 'logs'
    args['max_epochs'] = _args['max_epochs']
    args['matrix_size'] = _matrix.shape[0]
    args['batch_size'] = 2000
    args['partition'] = 0 if 'partition' not in _args else _args['partition']
    os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu']) if 'gpu' in _args else '0'

    trainer = gan.Train(**args)
    #
    # @TODO: Write the map.json in the output directory for the logs
    #
    f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']),'w')
    f.write(json.dumps(_map))
    f.close()
    trainer.apply()
    pass
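
#
# Illustrative shape of the 'store' argument consumed by train() (a sketch, not the
# canonical transport configuration): train() deep-copies _args['store']['logs'], sets
# its ['args']['doc'] to the context and hands the result to factory.instance(**...).
# Any field other than 'logs', 'args' and 'doc' shown below is a hypothetical placeholder.
#
#   _args['store'] = {
#       "logs": {
#           "provider": "...",                # hypothetical: whatever factory.instance expects
#           "args": {"doc": "<context>"}      # 'doc' is overwritten with _args['context']
#       }
#   }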
def _train (**args) :
    """
    This function is intended to train the GAN in order to learn about the distribution of the features
    :column     columns that need to be synthesized (discrete)
    :logs       location on disk where the output (learnt model checkpoints and logs) is stored
    :id         identifier of the dataset
    :data       data-frame to be synthesized
    :context    label of what we are synthesizing
    """
    column = args['column'] if (isinstance(args['column'],list)) else [args['column']]
    # CONTINUOUS = args['continuous'] if 'continuous' in args else []
    # column_id = args['id']
    df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
    df.columns = [name.lower() for name in df.columns]
    #
    # @TODO:
    # Consider sequential training of sub populations for extremely large datasets
    #

    #
    # If we have several columns we will proceed one at a time (it could be done in separate threads)
    # @TODO: Consider performing this task on several threads/GPUs simultaneously
    #
    for col in column :
        msize = args['matrix_size'] if 'matrix_size' in args else -1
        args['real'] = (Binary()).apply(df[col],msize)

        context = args['context']
        if 'store' in args :
            args['store']['args']['doc'] = context
            logger = factory.instance(**args['store'])
            args['logger'] = logger
            info = {"rows":args['real'].shape[0],"cols":args['real'].shape[1],"name":col,"partition":args['partition']}
            logger.write({"module":"gan-train","action":"data-prep","input":info})
        else:
            logger = None
        args['column'] = col
        args['context'] = col

        trainer = gan.Train(**args)
        trainer.apply()
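
#
# Illustrative per-column call of _train above (a sketch; the column names, context and
# epoch count are hypothetical, and gan.Train may expect additional parameters such as
# 'logs' or 'max_epochs' that are not validated here):
#
#   _train(data=df, column=['gender','race'], context='patients', partition=0,
#          logs='logs', max_epochs=10)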

def get (**args):
    """
    This function will restore a checkpoint from persistent storage onto disk
    """
    pass

def generate (**_args):
    """
    This function will generate a set of records; before doing so we must load the parameters that were learnt during training
    :param data        data-frame (or path) holding the original data
    :param context     label of what we are synthesizing (used to locate the learnt model)
    :param logs        location on disk where the learnt model and map.json were written
    :param max_epochs  number of training epochs of the learnt model
    :param candidates  number of candidate synthetic datasets to generate
    """
    f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']))
    _map = json.loads(f.read())
    f.close()
    if 'file' in _args :
        df = pd.read_csv(_args['file'])
    else:
        df = _args['data'] if not isinstance(_args['data'],str) else pd.read_csv(_args['data'])

    args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']}
    args['logs'] = _args['logs'] if 'logs' in _args else 'logs'
    args['max_epochs'] = _args['max_epochs']
    # args['matrix_size'] = _matrix.shape[0]
    args['batch_size'] = 2000
    args['partition'] = 0 if 'partition' not in _args else _args['partition']
    args['row_count'] = df.shape[0]
    #
    # @TODO: perhaps get the space of values here ... (not sure it's a good idea)
    #
    _args['map'] = _map
    _inputhandler = prepare.Input(**_args)
    values,_matrix = _inputhandler.convert()
    args['values'] = np.array(values)
    if 'gpu' in _args :
        os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu'])
    handler = gan.Predict(**args)
    handler.load_meta(None)
    #
    # Let us now format the matrices by reverting them to data-frames with values
    #
    candidates = handler.apply(candidates=args['candidates'])
    return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates]
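
#
# Note: generate() reads the 'map.json' that train() wrote under
# <logs>/output/<context>/map.json, so both calls must use the same 'logs' and 'context'.
# A minimal sketch (the values are hypothetical):
#
#   frames = generate(data=df, context='patients', logs='logs', max_epochs=10, candidates=3)
#   best   = frames[0]     # each candidate is a pandas.DataFrame produced by prepare.Input.revert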

def _generate(**args):
    """
    This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset
    @return pandas.DataFrame

    :data       data-frame to be synthesized
    :column     columns that need to be synthesized (discrete)
    :id         column identifying an entity
    :logs       location on disk where the learnt knowledge of the dataset is
    """
    # df = args['data']
    df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])

    CONTINUOUS = args['continuous'] if 'continuous' in args else []
    column = args['column'] if (isinstance(args['column'],list)) else [args['column']]
    # column_id = args['id']
    #
    # @TODO:
    # If the identifier is not present, we should find a way to determine or make one
    #
    BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size'])
    # NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value']
    bhandler = Binary()
    _df = df.copy()
    for col in column :
        args['context'] = col
        args['column'] = col

        msize = args['matrix_size'] if 'matrix_size' in args else -1
        values = bhandler.get_column(df[col],msize)
        MISSING = bhandler.get_missing(df[col],msize)

        args['values'] = values
        args['row_count'] = df.shape[0]
        # if col in NO_VALUE :
        #   args['no_value'] = NO_VALUE[col]
        # else:
        #   args['no_value'] = NO_VALUE
        # novalue = NO_VALUE[col] if NO_VALUE[col] in ['na',''] else NO_VALUE[col]
        # MISSING += [NO_VALUE[col]]
        args['missing'] = MISSING
        #
        # we can determine the cardinalities here so we know what to allow or disallow
        handler = gan.Predict(**args)
        handler.load_meta(col)
        r = handler.apply()
        if col in CONTINUOUS :
            r[col] = np.array(r[col])
            _approx = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) #-- approximating based on arbitrary bins
            r[col] = _approx

        _df[col] = r[col]
        #
        # Let's cast the type to the original type (it makes the data more usable)
        #
        # print (values)
        # print ([col,df[col].dtype,_df[col].tolist()])
        otype = df[col].dtype
        _df[col] = _df[col].astype(otype)

        #
        # @TODO: log basic stats about the synthetic attribute
        #
        # print (r)
        # break

    return _df