data-maker/data/maker/__init__.py


"""
(c) 2019 Data Maker, hiplab.mc.vanderbilt.edu
version 1.0.0
This package serves as a proxy to the overall usage of the framework.
This package is designed to generate synthetic data from an original dataset using deep learning techniques
@TODO:
- Make configurable GPU, EPOCHS
"""
import pandas as pd
import numpy as np
import data.gan as gan
from transport import factory
from data.bridge import Binary
import threading as thread
from data.maker import prepare
import copy
import os
import json
class ContinuousToDiscrete :
ROUND_UP = 2
    @staticmethod
    def binary(X,n=4) :
        """
        This function will convert a continuous stream of values into a binary (one-hot) matrix of n bins per value
        """
        values = np.array(X).astype(np.float32)
        BOUNDS = ContinuousToDiscrete.bounds(values,n)
        matrix = np.repeat(np.zeros(n),len(X)).reshape(len(X),n)
        #
        # Flag the bin each value falls into ; a row stays at zero if rounding pushed the value outside every bin
        #
        for i in np.arange(len(values)) :
            for j in np.arange(n) :
                if values[i] >= BOUNDS[j].left and values[i] <= BOUNDS[j].right :
                    matrix[i][j] = 1
                    break
        return matrix
    @staticmethod
    def bounds(x,n):
        """
        This function will compute the boundaries (pandas intervals) of n bins over a continuous column
        """
        values = np.round(x,ContinuousToDiscrete.ROUND_UP)
        return list(pd.cut(values,n).categories)
    @staticmethod
    def continuous(X,BIN_SIZE=4) :
        """
        This function will approximate a stream of continuous values by binning them and resampling each value
        uniformly within the bin it falls into
        :X          list/array of continuous values
        :BIN_SIZE   number of bins used for the approximation
        """
        BOUNDS = ContinuousToDiscrete.bounds(X,BIN_SIZE)
        values = []
        for i in np.arange(len(X)):
            value = X[i]
            for item in BOUNDS :
                if value >= item.left and value <= item.right :
                    values += [np.round(np.random.uniform(item.left,item.right),ContinuousToDiscrete.ROUND_UP)]
                    break
            else:
                #
                # The value falls outside every bin (rounding edge case) ; keep it so the output stays aligned with the input
                #
                values += [np.round(value,ContinuousToDiscrete.ROUND_UP)]
        return values
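
    #
    # Illustrative example (made-up values) : given ages = [21.0, 35.5, 62.0]
    #   ContinuousToDiscrete.binary(ages,n=4)       returns a 3 x 4 one-hot matrix (one bin flagged per value)
    #   ContinuousToDiscrete.continuous(ages,4)     returns 3 values, each resampled uniformly within its bin
    #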
def train (**_args):
"""
:params sql
:params store
"""
#
# Let us prepare the data by calling the utility function
#
    #
    # The data is expected to be provided as a data-frame (read upstream from a file or a data-store)
    #
_inputhandler = prepare.Input(**_args)
values,_matrix = _inputhandler.convert()
args = {"real":_matrix,"context":_args['context']}
_map = {}
if 'store' in _args :
        #
        # A store configuration was provided ; instantiate a logger (transport factory) to keep track of this run.
        # Note that _args['store']['logs'] must contain an 'args' dictionary, since its 'doc' entry is overwritten with the context below.
        #
args['store'] = copy.deepcopy(_args['store']['logs'])
args['store']['args']['doc'] = _args['context']
logger = factory.instance(**args['store'])
args['logger'] = logger
for key in _inputhandler._map :
beg = _inputhandler._map[key]['beg']
end = _inputhandler._map[key]['end']
values = _inputhandler._map[key]['values'].tolist()
_map[key] = {"beg":beg,"end":end,"values":np.array(values).astype(str).tolist()}
info = {"rows":_matrix.shape[0],"cols":_matrix.shape[1],"map":_map}
logger.write({"module":"gan-train","action":"data-prep","context":_args['context'],"input":_inputhandler._io})
args['logs'] = _args['logs'] if 'logs' in _args else 'logs'
    args['max_epochs'] = _args['max_epochs']
args['matrix_size'] = _matrix.shape[0]
args['batch_size'] = 2000
args['partition'] = 0 if 'partition' not in _args else _args['partition']
os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0'
trainer = gan.Train(**args)
    #
    # Persist the column map (map.json) in the output directory so that generate() can later revert the encoding
    #
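    #
    # For every column, map.json keeps the beg/end markers and the observed values (serialized as strings)
    # exactly as assembled in _map above ; generate() reloads this file to revert the encoding
    #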
f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']),'w')
f.write(json.dumps(_map))
f.close()
trainer.apply()
pass
def _train (**args) :
"""
This function is intended to train the GAN in order to learn about the distribution of the features
:column columns that need to be synthesized (discrete)
    :logs where the output (learnt model) is stored on disk
:id identifier of the dataset
:data data-frame to be synthesized
:context label of what we are synthesizing
"""
column = args['column'] if (isinstance(args['column'],list)) else [args['column']]
# CONTINUOUS = args['continuous'] if 'continuous' in args else []
# column_id = args['id']
df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
df.columns = [name.lower() for name in df.columns]
#
# @TODO:
# Consider sequential training of sub population for extremely large datasets
#
#
# If we have several columns we will proceed one at a time (it could be done in separate threads)
    # @TODO : Consider performing this task on several threads/GPUs simultaneously
#
for col in column :
msize = args['matrix_size'] if 'matrix_size' in args else -1
args['real'] = (Binary()).apply(df[col],msize)
context = args['context']
if 'store' in args :
args['store']['args']['doc'] = context
logger = factory.instance(**args['store'])
args['logger'] = logger
info = {"rows":args['real'].shape[0],"cols":args['real'].shape[1],"name":col,"partition":args['partition']}
logger.write({"module":"gan-train","action":"data-prep","input":info})
else:
logger = None
args['column'] = col
args['context'] = col
trainer = gan.Train(**args)
trainer.apply()
def get(**args):
"""
    This function will restore a checkpoint from a persistent store onto disk
"""
pass
def generate(**_args):
"""
This function will generate a set of records, before we must load the parameters needed
:param data
:param context
:param logs
"""
f = open(os.sep.join([_args['logs'],'output',_args['context'],'map.json']))
_map = json.loads(f.read())
f.close()
# if 'file' in _args :
# df = pd.read_csv(_args['file'])
# else:
# df = _args['data'] if not isinstance(_args['data'],str) else pd.read_csv(_args['data'])
args = {"context":_args['context'],"max_epochs":_args['max_epochs'],"candidates":_args['candidates']}
args['logs'] = _args['logs'] if 'logs' in _args else 'logs'
# args['matrix_size'] = _matrix.shape[0]
args['batch_size'] = 2000
args['partition'] = 0 if 'partition' not in _args else _args['partition']
args['row_count'] = _args['data'].shape[0]
#
# @TODO: perhaps get the space of values here ... (not sure it's a good idea)
#
_args['map'] = _map
_inputhandler = prepare.Input(**_args)
values,_matrix = _inputhandler.convert()
args['values'] = np.array(values)
if 'gpu' in _args :
os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu'])
handler = gan.Predict (**args)
handler.load_meta(None)
#
# Let us now format the matrices by reverting them to a data-frame with values
#
candidates = handler.apply(candidates=args['candidates'])
return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates]
def _generate(**args):
"""
This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset
@return pandas.DataFrame
:data data-frame to be synthesized
:column columns that need to be synthesized (discrete)
:id column identifying an entity
    :logs location on disk where the learnt knowledge of the dataset is stored
"""
# df = args['data']
df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])
CONTINUOUS = args['continuous'] if 'continuous' in args else []
column = args['column'] if (isinstance(args['column'],list)) else [args['column']]
# column_id = args['id']
#
#@TODO:
    # If the identifier is not present, we should find a way to determine or make one
#
BIN_SIZE = 4 if 'bin_size' not in args else int(args['bin_size'])
# NO_VALUE = dict(args['no_value']) if type(args['no_value']) == dict else args['no_value']
bhandler = Binary()
_df = df.copy()
for col in column :
args['context'] = col
args['column'] = col
msize = args['matrix_size'] if 'matrix_size' in args else -1
values = bhandler.get_column(df[col],msize)
MISSING= bhandler.get_missing(df[col],msize)
args['values'] = values
args['row_count'] = df.shape[0]
# if col in NO_VALUE :
# args['no_value'] = NO_VALUE[col]
# else:
# args['no_value'] = NO_VALUE
# novalue = NO_VALUE[col] if NO_VALUE[col] in ['na',''] else NO_VALUE[col]
# MISSING += [NO_VALUE[col]]
args['missing'] = MISSING
#
# we can determine the cardinalities here so we know what to allow or disallow
handler = gan.Predict (**args)
handler.load_meta(col)
r = handler.apply()
if col in CONTINUOUS :
r[col] = np.array(r[col])
_approx = ContinuousToDiscrete.continuous(r[col],BIN_SIZE) #-- approximating based on arbitrary bins
r[col] = _approx
_df[col] = r[col]
#
# Let's cast the type to the original type (it makes the data more usable)
#
otype = df[col].dtype
_df[col] = _df[col].astype(otype)
#
# @TODO: log basic stats about the synthetic attribute
#
# break
return _df