""" (c) 2019 Data Maker, hiplab.mc.vanderbilt.edu version 1.0.0 This package serves as a proxy to the overall usage of the framework. This package is designed to generate synthetic data from a dataset from an original dataset using deep learning techniques @TODO: - Make configurable GPU, EPOCHS """ import pandas as pd import numpy as np import data.gan as gan from transport import factory from data.bridge import Binary import threading as thread from data.maker import prepare import copy import os import json class ContinuousToDiscrete : ROUND_UP = 2 @staticmethod def binary(X,n=4) : """ This function will convert a continous stream of information into a variety a bit stream of bins """ values = np.array(X).astype(np.float32) BOUNDS = ContinuousToDiscrete.bounds(values,n) matrix = np.repeat(np.zeros(n),len(X)).reshape(len(X),n) @staticmethod def bounds(x,n): # return np.array_split(x,n) values = np.round(x,ContinuousToDiscrete.ROUND_UP) return list(pd.cut(values,n).categories) @staticmethod def continuous(X,BIN_SIZE=4) : """ This function will approximate a binary vector given boundary information :X binary matrix :BIN_SIZE """ BOUNDS = ContinuousToDiscrete.bounds(X,BIN_SIZE) values = [] # _BINARY= ContinuousToDiscrete.binary(X,BIN_SIZE) # # # print (BOUNDS) l = {} for i in np.arange(len(X)): #value in X : value = X[i] for item in BOUNDS : if value >= item.left and value <= item.right : values += [np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP)] break # values += [ np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP) for item in BOUNDS if value >= item.left and value <= item.right ] # # values = [] # for row in _BINARY : # # ubound = BOUNDS[row.index(1)] # index = np.where(row == 1)[0][0] # ubound = BOUNDS[ index ].right # lbound = BOUNDS[ index ].left # x_ = np.round(np.random.uniform(lbound,ubound),ContinuousToDiscrete.ROUND_UP).astype(float) # values.append(x_) # lbound = ubound # values = [np.random.uniform() for item in BOUNDS] return values def train (**_args): """ :params sql :params store """ _inputhandler = prepare.Input(**_args) values,_matrix = _inputhandler.convert() args = {"real":_matrix,"context":_args['context']} _map = {} if 'store' in _args : # # This args['store'] = copy.deepcopy(_args['store']['logs']) args['store']['args']['doc'] = _args['context'] logger = factory.instance(**args['store']) args['logger'] = logger for key in _inputhandler._map : beg = _inputhandler._map[key]['beg'] end = _inputhandler._map[key]['end'] values = _inputhandler._map[key]['values'].tolist() _map[key] = {"beg":beg,"end":end,"values":np.array(values).astype(str).tolist()} info = {"rows":_matrix.shape[0],"cols":_matrix.shape[1],"map":_map} logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":_inputhandler._io}) args['logs'] = _args['logs'] if 'logs' in _args else 'logs' args ['max_epochs'] = _args['max_epochs'] args['matrix_size'] = _matrix.shape[0] args['batch_size'] = 2000 if 'partition' in _args : args['partition'] = _args['partition'] if 'gpu' in _args : args['gpu'] = _args['gpu'] # os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0' trainer = gan.Train(**args) # # @TODO: Write the map.json in the output directory for the logs # # f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']),'w') f = open(os.sep.join([trainer.out_dir,'map.json']),'w') f.write(json.dumps(_map)) f.close() trainer.apply() pass def get(**args): """ This function will restore 
def get(**args):
    """
    This function will restore a checkpoint from persistent storage onto disk
    """
    pass

def generate(**_args):
    """
    This function will generate a set of records; the trained parameters must be loaded beforehand
    :param data
    :param context
    :param logs
    """
    # default the logs directory before it is used to build LOG_DIR
    _args['logs'] = _args['logs'] if 'logs' in _args else 'logs'
    partition = _args['partition'] if 'partition' in _args else None
    if not partition :
        LOG_DIR = os.sep.join([_args['logs'],'output',_args['context']])
    else:
        LOG_DIR = os.sep.join([_args['logs'],'output',_args['context'],str(partition)])
    f = open(os.sep.join([LOG_DIR,'map.json']))
    _map = json.loads(f.read())
    f.close()

    args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']}
    args['logs'] = LOG_DIR
    args['batch_size'] = 2000
    args['partition'] = 0 if 'partition' not in _args else _args['partition']
    args['row_count'] = _args['data'].shape[0]
    #
    # @TODO: perhaps get the space of values here ... (not sure it's a good idea)
    #
    _args['map'] = _map
    _inputhandler = prepare.Input(**_args)
    values,_matrix = _inputhandler.convert()
    args['values'] = np.array(values)
    if 'gpu' in _args :
        args['gpu'] = _args['gpu']
    handler = gan.Predict (**args)
    lparams = {'columns':None}
    if partition :
        lparams['partition'] = partition
    handler.load_meta(**lparams)
    #
    # Format the candidate matrices by reverting them to data-frames with values
    #
    candidates = handler.apply(candidates=args['candidates'])
    return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates]
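#
# Minimal end-to-end usage sketch, assuming the input is a pandas DataFrame.
# The file name, context, epoch count and candidate count below are
# hypothetical, and the keyword names are inferred from what train() and
# generate() read out of _args; prepare.Input may expect additional fields.
# train() writes its checkpoints and map.json under <logs>/output/<context>,
# which is where generate() expects to find them.
#
if __name__ == '__main__' :
    _df = pd.read_csv('sample.csv')  # hypothetical input file
    train(data=_df,context='demo',logs='logs',max_epochs=10)
    _candidates = generate(data=_df,context='demo',logs='logs',max_epochs=10,candidates=1)
    print (_candidates[0].head())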