data-maker/data/maker/__init__.py

"""
(c) 2019 Data Maker, hiplab.mc.vanderbilt.edu
version 1.0.0

This package serves as a proxy to the overall usage of the framework.
This package is designed to generate synthetic data from an original dataset using deep learning techniques.

@TODO:
    - Make configurable GPU, EPOCHS
"""
import pandas as pd
import numpy as np
import data.gan as gan
from transport import factory
from data.bridge import Binary
import threading as thread
from data.maker import prepare
import copy
import os
import json

class ContinuousToDiscrete :
    """
    Helpers that bin a continuous variable into equal-width intervals (via pandas.cut)
    and that draw approximate continuous values back out of those intervals.
    """
    # Number of decimals kept when rounding inputs and sampled outputs
    ROUND_UP = 2

    @staticmethod
    def _cut(x,n):
        """
        Round x to ROUND_UP decimals and cut it into n equal-width bins
        :x  1-D sequence of numeric values
        :n  number of bins
        :return pandas Categorical: `.categories` holds the intervals, `.codes` the bin index of every value (-1 for NaN)
        """
        values = np.round(np.asarray(x),ContinuousToDiscrete.ROUND_UP)
        return pd.cut(values,n)

    @staticmethod
    def binary(X,n=4) :
        """
        This function will convert a continuous stream of values into a one-hot matrix of bins
        :X  1-D sequence of numeric values
        :n  number of bins (i.e. number of columns of the matrix)
        :return matrix of shape (len(X),n); row i has a 1 in the column of the bin X[i] falls in
                (rows of NaN inputs are left as all zeros)
        """
        values = np.array(X).astype(np.float32)
        matrix = np.zeros((len(values),n))
        if len(values) == 0 :
            # pandas.cut refuses empty input, there is nothing to encode anyway
            return matrix
        codes = np.asarray(ContinuousToDiscrete._cut(values,n).codes)
        rows  = np.flatnonzero(codes >= 0)  # skip values that could not be binned (NaN)
        matrix[rows,codes[rows]] = 1
        return matrix

    @staticmethod
    def bounds(x,n):
        """
        This function returns the boundaries of the bins of a continuous variable
        :x  1-D sequence of numeric values
        :n  number of bins
        :return list of n pandas Interval objects (right-closed)
        """
        return list(ContinuousToDiscrete._cut(x,n).categories)

    @staticmethod
    def continuous(X,BIN_SIZE=4) :
        """
        This function will approximate every value of X by a random value drawn (uniformly)
        from the bin the value falls in
        :X  1-D sequence of numeric values
        :BIN_SIZE   number of bins
        :return list with one sampled value (rounded to ROUND_UP decimals) per input value;
                an input that can not be binned (NaN) yields NaN so the output stays aligned with X
        """
        raw = np.asarray(X,dtype=float)
        if raw.size == 0 :
            return []
        #
        # The bin of each value is read from the same cut that produced the boundaries.
        # Comparing the raw (un-rounded) values against boundaries computed on rounded
        # values would drop values sitting just outside of the outer edges
        cuts   = ContinuousToDiscrete._cut(raw,BIN_SIZE)
        BOUNDS = list(cuts.categories)
        values = []
        for code in cuts.codes :
            if code < 0 :
                values.append(float('nan'))
                continue
            item = BOUNDS[code]
            values.append(np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP))
        return values
            

def train (**_args):
    """
    This function prepares the input data, writes the map of the encoded columns (map.json)
    in the output folder of the trainer and runs the training.
    All the arguments are also handed as-is to prepare.Input
    :params context     label of the training run (required)
    :params max_epochs  handed to gan.Train (required)
    :params store       optional, store['logs'] is the configuration of the logger (built with transport.factory)
    :params logs        optional, log folder (defaults to 'logs')
    :params partition   optional, handed to gan.Train
    :params gpu         optional, handed to gan.Train
    """
    
    _inputhandler = prepare.Input(**_args)
    _,_matrix = _inputhandler.convert()
    args  = {"real":_matrix,"context":_args['context']}
    _map = {}
    if 'store' in _args :
        #
        # The logger configuration is copied so the caller's store is left untouched
        #
        args['store'] = copy.deepcopy(_args['store']['logs'])
        args['store']['args']['doc'] = _args['context']
        logger = factory.instance(**args['store'])
        args['logger'] = logger
        #
        # NOTE(review): the map is only populated when a store is provided, without it map.json
        # is written as an empty object -- confirm this is intended
        for key in _inputhandler._map :
            beg = _inputhandler._map[key]['beg']
            end = _inputhandler._map[key]['end']
            values = _inputhandler._map[key]['values'].tolist()
            # values are cast to strings so the map can be serialized to JSON
            _map[key] = {"beg":beg,"end":end,"values":np.array(values).astype(str).tolist()}
        logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":_inputhandler._io})
    
    args['logs'] = _args['logs'] if 'logs' in _args else 'logs'
    args ['max_epochs'] = _args['max_epochs']
    args['matrix_size'] = _matrix.shape[0]
    args['batch_size'] = 2000
    if 'partition' in _args :
        args['partition'] = _args['partition']
    if 'gpu' in _args :
        args['gpu'] = _args['gpu']
    
    trainer = gan.Train(**args)   
    #
    # The map is stored next to the output of the trainer, the context manager makes sure
    # the file is closed even if the write fails
    #
    with open(os.sep.join([trainer.out_dir,'map.json']),'w') as f :
        f.write(json.dumps(_map))

    trainer.apply()

def get(**args):
    """
    Placeholder: meant to restore a checkpoint from persistent storage onto disk.
    Not implemented yet, the arguments are ignored and nothing is returned.
    """
    return None
def generate(**_args):
    """
    This function will generate a set of records, the map (map.json) written at training time
    is loaded first and handed to prepare.Input along with all the other arguments
    :param data         original data, only its number of rows (data.shape[0]) is read here
    :param context      label of the training run
    :param logs         log folder, map.json is read from <logs>/output/<context>[/<partition>]
    :param max_epochs   handed to gan.Predict
    :param candidates   handed to gan.Predict and to its apply function
    :param partition    optional, handed to gan.Predict (defaults to 0)
    :param gpu          optional, handed to gan.Predict
    :return list with one reverted matrix (prepare.Input.revert) per candidate
    """
    partition = _args['partition'] if 'partition' in _args else None
    #
    # NOTE(review): a partition of 0 is falsy and is therefore handled like "no partition" -- confirm this is intended
    if not partition :
        LOG_DIR = os.sep.join([_args['logs'],'output',_args['context']])
    else:
        LOG_DIR = os.sep.join([_args['logs'],'output',_args['context'],str(partition)])
    # the context manager makes sure the file is closed even if the content is not valid JSON
    with open(os.sep.join([LOG_DIR,'map.json'])) as f :
        _map = json.loads(f.read())

    args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']}
    # 'logs' is always provided at this point (it was needed to build LOG_DIR)
    args['logs'] = LOG_DIR
    args['batch_size'] = 2000
    args['partition'] = 0 if 'partition' not in _args else _args['partition']
    args['row_count'] = _args['data'].shape[0]
    #
    # @TODO: perhaps get the space of values here ... (not sure it's a good idea)
    #
    _args['map']  = _map
    _inputhandler = prepare.Input(**_args)
    values,_ = _inputhandler.convert()    
    args['values'] = np.array(values)
    if 'gpu' in _args :
        args['gpu'] = _args['gpu']
       
    handler     = gan.Predict (**args)
    lparams = {'columns':None}
    if partition :
        lparams['partition'] = partition
    
    handler.load_meta(**lparams)
    #
    # Let us now format the matrices by reverting them to a data-frame with values
    #
    candidates = handler.apply(candidates=args['candidates'])       
    return [_inputhandler.revert(matrix=_candidate) for _candidate in candidates]
fixes with the framework - only supports single feature 2020-01-01 05:27:53 +00:00			`"""`
			`(c) 2019 Data Maker, hiplab.mc.vanderbilt.edu`
			`version 1.0.0`

			`This package serves as a proxy to the overall usage of the framework.`
			`This package is designed to generate synthetic data from a dataset from an original dataset using deep learning techniques`

			`@TODO:`
			`- Make configurable GPU, EPOCHS`
			`"""`
			`import pandas as pd`
			`import numpy as np`
bug fix with imports 2020-01-05 05:02:15 +00:00			`import data.gan as gan`
bug fixes with operations 2020-01-04 03:47:05 +00:00			`from transport import factory`
bug fix with binary matrix generation 2020-02-18 08:59:39 +00:00			`from data.bridge import Binary`
not sure about the changes (oops) 2020-02-11 18:00:16 +00:00			`import threading as thread`
bug fixes: design improvements 2021-03-29 16:10:57 +00:00			`from data.maker import prepare`
			`import copy`
			`import os`
			`import json`

Handling of continous values 2020-02-29 03:37:26 +00:00			`class ContinuousToDiscrete :`
bug fix and upgrades to base functionalities 2020-03-04 17:49:18 +00:00			`ROUND_UP = 2`
Handling of continous values 2020-02-29 03:37:26 +00:00			`@staticmethod`
			`def binary(X,n=4) :`
			`"""`
			`This function will convert a continous stream of information into a variety a bit stream of bins`
			`"""`
bug fix ... 2020-03-12 19:37:01 +00:00			`values = np.array(X).astype(np.float32)`
bug fix: continuous variable handling 2020-03-12 14:41:54 +00:00			`BOUNDS = ContinuousToDiscrete.bounds(values,n)`
bug fixes and optimizations 2020-04-01 05:21:51 +00:00			`matrix = np.repeat(np.zeros(n),len(X)).reshape(len(X),n)`

Handling of continous values 2020-02-29 03:37:26 +00:00
			`@staticmethod`
			`def bounds(x,n):`
bug fix, trainer 2020-03-07 15:16:17 +00:00			`# return np.array_split(x,n)`
bug fix: continuous variable handling 2020-03-12 14:41:54 +00:00			`values = np.round(x,ContinuousToDiscrete.ROUND_UP)`
			`return list(pd.cut(values,n).categories)`
Handling of continous values 2020-02-29 03:37:26 +00:00


			`@staticmethod`
			`def continuous(X,BIN_SIZE=4) :`
			`"""`
			`This function will approximate a binary vector given boundary information`
			`:X binary matrix`
			`:BIN_SIZE`
			`"""`
			`BOUNDS = ContinuousToDiscrete.bounds(X,BIN_SIZE)`

			`values = []`
bug fix with ICD and some minor improvements 2020-03-25 22:43:23 +00:00			`# _BINARY= ContinuousToDiscrete.binary(X,BIN_SIZE)`
			`# # # print (BOUNDS)`
			`l = {}`
bug fixes and optimizations 2020-04-01 05:21:51 +00:00			`for i in np.arange(len(X)): #value in X :`

			`value = X[i]`
bug fix with ICD and some minor improvements 2020-03-25 22:43:23 +00:00
bug fixes and optimizations 2020-04-01 05:21:51 +00:00			`for item in BOUNDS :`
			`if value >= item.left and value <= item.right :`
			`values += [np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP)]`
			`break`
			`# values += [ np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP) for item in BOUNDS if value >= item.left and value <= item.right ]`
Handling of continous values 2020-02-29 03:37:26 +00:00
bug fix with ICD and some minor improvements 2020-03-25 22:43:23 +00:00
			`# # values = []`
			`# for row in _BINARY :`
			`# # ubound = BOUNDS[row.index(1)]`
			`# index = np.where(row == 1)[0][0]`

			`# ubound = BOUNDS[ index ].right`
			`# lbound = BOUNDS[ index ].left`
Handling of continous values 2020-02-29 03:37:26 +00:00
bug fix with ICD and some minor improvements 2020-03-25 22:43:23 +00:00			`# x_ = np.round(np.random.uniform(lbound,ubound),ContinuousToDiscrete.ROUND_UP).astype(float)`
			`# values.append(x_)`
Handling of continous values 2020-02-29 03:37:26 +00:00
bug fix with ICD and some minor improvements 2020-03-25 22:43:23 +00:00			`# lbound = ubound`

			`# values = [np.random.uniform() for item in BOUNDS]`
Handling of continous values 2020-02-29 03:37:26 +00:00
			`return values`


bug fixes: design improvements 2021-03-29 16:10:57 +00:00			`def train (**_args):`
			`"""`
			`:params sql`
			`:params store`
			`"""`
bug fixes 2021-03-30 09:56:01 +00:00
bug fixes: design improvements 2021-03-29 16:10:57 +00:00			`_inputhandler = prepare.Input(**_args)`
			`values,_matrix = _inputhandler.convert()`
			`args = {"real":_matrix,"context":_args['context']}`
			`_map = {}`
			`if 'store' in _args :`
			`#`
			`# This`
bug fixes 2021-03-30 09:56:01 +00:00
bug fixes: design improvements 2021-03-29 16:10:57 +00:00			`args['store'] = copy.deepcopy(_args['store']['logs'])`
			`args['store']['args']['doc'] = _args['context']`
			`logger = factory.instance(**args['store'])`
			`args['logger'] = logger`

			`for key in _inputhandler._map :`
			`beg = _inputhandler._map[key]['beg']`
			`end = _inputhandler._map[key]['end']`
			`values = _inputhandler._map[key]['values'].tolist()`
			`_map[key] = {"beg":beg,"end":end,"values":np.array(values).astype(str).tolist()}`
			`info = {"rows":_matrix.shape[0],"cols":_matrix.shape[1],"map":_map}`
bug fix: log information about space 2021-03-30 21:14:48 +00:00			`logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":_inputhandler._io})`
bug fixes: design improvements 2021-03-29 16:10:57 +00:00
			`args['logs'] = _args['logs'] if 'logs' in _args else 'logs'`
			`args ['max_epochs'] = _args['max_epochs']`
			`args['matrix_size'] = _matrix.shape[0]`
			`args['batch_size'] = 2000`
feature: bootstrap-like with candidates 2021-04-07 20:30:59 +00:00			`if 'partition' in _args :`
			`args['partition'] = _args['partition']`
gpu indexing 2021-04-01 18:20:35 +00:00			`if 'gpu' in _args :`
			`args['gpu'] = _args['gpu']`
gpu indexing 2021-04-01 18:09:06 +00:00			`# os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0'`
Handling of continous values 2020-02-29 03:37:26 +00:00
bug fixes: design improvements 2021-03-29 16:10:57 +00:00			`trainer = gan.Train(**args)`
			`#`
			`# @TODO: Write the map.json in the output directory for the logs`
			`#`
feature: bootstrap-like with candidates 2021-04-07 20:30:59 +00:00			`# f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']),'w')`
			`f = open(os.sep.join([trainer.out_dir,'map.json']),'w')`
bug fixes: design improvements 2021-03-29 16:10:57 +00:00			`f.write(json.dumps(_map))`
			`f.close()`

			`trainer.apply()`
			`pass`
bug fix with partition & data -access 2020-03-08 13:48:38 +00:00
bug fix ... need to design porting/loading models on the fly 2020-02-12 18:43:30 +00:00			`def get(**args):`
not sure about the changes (oops) 2020-02-11 18:00:16 +00:00			`"""`
			`This function will restore a checkpoint from a persistant storage on to disk`
			`"""`
			`pass`
bug fixes: design improvements 2021-03-29 16:10:57 +00:00			`def generate(**_args):`
			`"""`
			`This function will generate a set of records, before we must load the parameters needed`
			`:param data`
			`:param context`
			`:param logs`
			`"""`
feature: bootstrap-like with candidates 2021-04-07 20:30:59 +00:00			`partition = _args['partition'] if 'partition' in _args else None`
			`if not partition :`
bug fix ... 2021-05-10 19:33:18 +00:00			`LOG_DIR = os.sep.join([_args['logs'],'output',_args['context']])`
			`# f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']))`
feature: bootstrap-like with candidates 2021-04-07 20:30:59 +00:00			`else:`
bug fix ... 2021-05-10 19:33:18 +00:00			`LOG_DIR = os.sep.join([_args['logs'],'output',_args['context'],str(partition)])`
			`# f = open(os.sep.join([_args['logs'],'output',_args['context'],str(partition),'map.json']))`
			`f = open(os.sep.join([LOG_DIR,'map.json']))`
bug fixes: design improvements 2021-03-29 16:10:57 +00:00			`_map = json.loads(f.read())`
			`f.close()`
optimization (minor) 2021-03-30 14:00:57 +00:00			`# if 'file' in _args :`
			`# df = pd.read_csv(_args['file'])`
			`# else:`
			`# df = _args['data'] if not isinstance(_args['data'],str) else pd.read_csv(_args['data'])`
bug fixes: design improvements 2021-03-29 16:10:57 +00:00			`args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']}`
bug fix ... 2021-05-10 19:33:18 +00:00			`args['logs'] = LOG_DIR if 'logs' in _args else 'logs'`
bug fixes: design improvements 2021-03-29 16:10:57 +00:00			`args ['max_epochs'] = _args['max_epochs']`
			`# args['matrix_size'] = _matrix.shape[0]`
			`args['batch_size'] = 2000`
			`args['partition'] = 0 if 'partition' not in _args else _args['partition']`
optimization (minor) 2021-03-30 14:00:57 +00:00			`args['row_count'] = _args['data'].shape[0]`
bug fixes: design improvements 2021-03-29 16:10:57 +00:00			`#`
			`# @TODO: perhaps get the space of values here ... (not sure it's a good idea)`
			`#`
			`_args['map'] = _map`
			`_inputhandler = prepare.Input(**_args)`
			`values,_matrix = _inputhandler.convert()`
			`args['values'] = np.array(values)`
gpu indexing 2021-04-01 18:20:35 +00:00			`if 'gpu' in _args :`
			`args['gpu'] = _args['gpu']`
gpu indexing 2021-04-01 18:09:06 +00:00
bug fixes: design improvements 2021-03-29 16:10:57 +00:00			`handler = gan.Predict (**args)`
bug fix 2021-05-10 19:43:29 +00:00			`lparams = {'columns':None}`
			`if partition :`
			`lparams['partition'] = partition`

bug fixes .... 2021-05-10 19:49:08 +00:00			`handler.load_meta(**lparams)`
bug fixes: design improvements 2021-03-29 16:10:57 +00:00			`#`
new features, bug fixes 2021-03-29 23:53:57 +00:00			`# Let us now format the matrices by reverting them to a data-frame with values`
bug fixes: design improvements 2021-03-29 16:10:57 +00:00			`#`

			`candidates = handler.apply(candidates=args['candidates'])`
			`return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates]`