2020-01-01 05:27:53 +00:00
"""
( c ) 2019 Data Maker , hiplab . mc . vanderbilt . edu
version 1.0 .0
This package serves as a proxy to the overall usage of the framework .
This package is designed to generate synthetic data from an original dataset using deep learning techniques
@TODO :
- Make configurable GPU , EPOCHS
"""
import pandas as pd
import numpy as np
2020-01-05 05:02:15 +00:00
import data . gan as gan
2020-01-04 03:47:05 +00:00
from transport import factory
2020-02-18 08:59:39 +00:00
from data . bridge import Binary
2020-02-11 18:00:16 +00:00
import threading as thread
2021-03-29 16:10:57 +00:00
from data . maker import prepare
import copy
import os
import json
2020-02-29 03:37:26 +00:00
class ContinuousToDiscrete:
    """
    Helpers that map a continuous variable onto equal-width bins and back.

    The bins are computed with pandas.cut on the input rounded to ROUND_UP decimals.
    """
    # number of decimals kept when rounding inputs and generated values
    ROUND_UP = 2

    @staticmethod
    def binary(X, n=4):
        """
        This function will convert a continuous stream of values into a one-hot matrix of bins
        :X  sequence of numeric values
        :n  number of bins
        :return numpy matrix of shape (len(X), n); row i holds a single 1 in the column
                of the bin that contains X[i], rows of values that fall in no bin (NaN) stay all-zero
        """
        values = np.array(X).astype(np.float32)
        matrix = np.zeros((len(values), n))
        if len(values) == 0:
            # pandas.cut refuses to bin an empty array, there is nothing to flag anyway
            return matrix
        #
        # The codes are computed on the rounded values so they line up with the intervals returned by bounds
        rounded = np.round(values, ContinuousToDiscrete.ROUND_UP)
        codes = np.asarray(pd.cut(rounded, n).codes)
        rows = np.flatnonzero(codes >= 0)   # a code of -1 marks a value without a bin
        matrix[rows, codes[rows]] = 1
        return matrix

    @staticmethod
    def bounds(x, n):
        """
        This function computes the n equal-width intervals that cover the rounded values of x
        :x  sequence of numeric values
        :n  number of bins
        :return list of pandas.Interval ordered from the lowest to the highest bin
        """
        values = np.round(x, ContinuousToDiscrete.ROUND_UP)
        return list(pd.cut(values, n).categories)

    @staticmethod
    def continuous(X, BIN_SIZE=4):
        """
        This function will approximate every value of X by a random value drawn from the bin that contains it
        :X          sequence of numeric values
        :BIN_SIZE   number of bins
        :return list with one rounded random draw per input value (values that fall in no bin e.g NaN are skipped)
        """
        data = np.asarray(X)
        if len(data) == 0:
            # pandas.cut refuses to bin an empty array
            return []
        BOUNDS = ContinuousToDiscrete.bounds(data, BIN_SIZE)
        values = []
        #
        # The bins are built from rounded values, the lookup must use the same rounding
        # otherwise a value slightly above the rounded maximum would match no bin and be dropped
        for value in np.round(data, ContinuousToDiscrete.ROUND_UP):
            for item in BOUNDS:
                if item.left <= value <= item.right:
                    sample = np.random.uniform(item.left, item.right)
                    values.append(np.round(sample, ContinuousToDiscrete.ROUND_UP))
                    break
        return values
2021-03-29 16:10:57 +00:00
def train(**_args):
    """
    This function prepares the input data, trains a GAN on it and writes the column map (map.json)
    in the output folder of the trainer. All the keyword arguments are forwarded to prepare.Input
    :params context     name of the context, also used as the 'doc' of the log store
    :params max_epochs  maximum number of epochs handed to the trainer
    :params store       (optional) store configuration, its 'logs' entry is used to create the logger
    :params logs        (optional) log folder, defaults to 'logs'
    :params partition   (optional) partition handed to the trainer
    :params gpu         (optional) gpu handed to the trainer
    """
    _inputhandler = prepare.Input(**_args)
    _, _matrix = _inputhandler.convert()
    args = {"real": _matrix, "context": _args['context']}

    #
    # The logger only exists when a store has been provided, every later use must account for that
    logger = None
    if 'store' in _args:
        args['store'] = copy.deepcopy(_args['store']['logs'])
        args['store']['args']['doc'] = _args['context']
        logger = factory.instance(**args['store'])
        args['logger'] = logger

    #
    # The map is written to disk below regardless of logging, so it is built unconditionally
    # The values are cast to strings so the map can be serialized to JSON
    _map = {}
    for key in _inputhandler._map:
        entry = _inputhandler._map[key]
        labels = np.array(entry['values'].tolist()).astype(str).tolist()
        _map[key] = {"beg": entry['beg'], "end": entry['end'], "values": labels}

    if logger is not None:
        logger.write({"module": "gan-train", "action": "data-prep", "context": _args['context'], "input": _inputhandler._io})

    args['logs'] = _args['logs'] if 'logs' in _args else 'logs'
    args['max_epochs'] = _args['max_epochs']
    args['matrix_size'] = _matrix.shape[0]
    args['batch_size'] = 2000
    if 'partition' in _args:
        args['partition'] = _args['partition']
    if 'gpu' in _args:
        args['gpu'] = _args['gpu']

    trainer = gan.Train(**args)
    #
    # The map is stored next to the trainer's output, the context manager closes the file even if the write fails
    with open(os.sep.join([trainer.out_dir, 'map.json']), 'w') as f:
        f.write(json.dumps(_map))
    trainer.apply()
2020-03-08 13:48:38 +00:00
2020-02-12 18:43:30 +00:00
def get(**args):
    """
    Placeholder for restoring a checkpoint from persistent storage onto disk.
    It is not implemented yet: any keyword arguments are accepted and ignored, nothing is returned
    """
    return None
2021-03-29 16:10:57 +00:00
def generate(**_args):
    """
    This function will generate a set of records, before that the map written at training time
    and the parameters needed by the predictor are loaded.
    All the keyword arguments (with the loaded map added under 'map') are forwarded to prepare.Input
    :param data         data-frame like object, its number of rows (shape[0]) is handed to the predictor
    :param context      name of the context
    :param max_epochs   maximum number of epochs handed to the predictor
    :param candidates   number of candidates handed to the predictor
    :param logs         (optional) log folder, defaults to 'logs'
    :param partition    (optional) partition whose map and meta data are loaded
    :param gpu          (optional) gpu handed to the predictor
    :return list with one reverted candidate per matrix returned by the predictor
    """
    partition = _args['partition'] if 'partition' in _args else None
    logs = _args['logs'] if 'logs' in _args else 'logs'
    #
    # The map sits in <logs>/output/<context>, with an extra folder for the partition when one is given
    # NOTE(review): a partition of 0 is falsy and is handled like no partition, confirm against the trainer
    folder = [logs, 'output', _args['context']]
    if partition:
        folder.append(str(partition))
    MAP_FOLDER = os.sep.join(folder)
    with open(os.sep.join([MAP_FOLDER, 'map.json'])) as f:
        _map = json.loads(f.read())

    args = {"context": _args['context'], "max_epochs": _args['max_epochs'], "candidates": _args['candidates']}
    args['logs'] = logs
    args['batch_size'] = 2000
    args['partition'] = 0 if 'partition' not in _args else _args['partition']
    args['row_count'] = _args['data'].shape[0]
    #
    # @TODO: perhaps get the space of values here ... (not sure it's a good idea)
    #
    _args['map'] = _map
    _inputhandler = prepare.Input(**_args)
    values, _ = _inputhandler.convert()
    args['values'] = np.array(values)
    if 'gpu' in _args:
        args['gpu'] = _args['gpu']

    handler = gan.Predict(**args)
    lparams = {'columns': None}
    if partition:
        lparams['partition'] = partition
    handler.load_meta(**lparams)
    #
    # Let us now format the matrices by reverting them to a data-frame with values
    #
    candidates = handler.apply(candidates=args['candidates'])
    return [_inputhandler.revert(matrix=_candidate) for _candidate in candidates]