data-maker/data/maker/__init__.py

"""
(c) 2019 Data Maker, hiplab.mc.vanderbilt.edu
version 1.0.0

This package serves as a proxy to the overall usage of the framework.
This package is designed to generate synthetic data from an original dataset using deep learning techniques

@TODO:
    - Make configurable GPU, EPOCHS
"""
import pandas as pd
import numpy as np
import data.gan as gan
from transport import factory
from data.bridge import Binary
import threading as thread
def train (**args) :
    """
    This function trains the GAN, one column at a time, in order to learn about the distribution of the features.
    Every keyword argument received is forwarded as-is to gan.Train, in addition to the ones set here
    ('real', 'column', 'context' and, if a store is provided, 'logger')

    :column     column or list of columns that need to be synthesized (discrete)
    :logs       location on disk where the output is written (not read here, forwarded to gan.Train)
    :id         identifier of the dataset (not read by this function)
    :data       data-frame to be synthesized, or the path of a CSV file to load it from
    :context    label of what we are synthesizing (overwritten here with the name of the column)
    :store      (optional) settings passed to transport.factory.instance to build a logger,
                it must hold an 'args' dictionary
    """
    columns = args['column'] if isinstance(args['column'], list) else [args['column']]
    df      = args['data'] if not isinstance(args['data'], str) else pd.read_csv(args['data'])
    #
    # NOTE: the column names are lower-cased in place, a data-frame handed in by the caller is thus modified too
    #
    df.columns = [name.lower() for name in df.columns]

    #
    # If we have several columns we will proceed one at a time (it could be done in separate threads)
    # @TODO : Consider performing this task on several threads/GPUs simultaneously
    #
    for col in columns :
        #
        # The frame has been lower-cased above, the lookup must be lower-cased as well otherwise a column
        # requested as 'Gender' raises a KeyError. The name provided by the caller is still the one forwarded
        # as column/context
        #
        key             = col.lower() if isinstance(col, str) else col
        args['real']    = pd.get_dummies(df[key]).astype(np.float32).values
        args['column']  = col
        args['context'] = col
        if 'store' in args :
            #
            # The 'doc' entry of the store settings is set to the column name before the logger is built,
            # this writes into the dictionary provided by the caller
            #
            args['store']['args']['doc'] = col
            args['logger'] = factory.instance(**args['store'])
        trainer = gan.Train(**args)
        trainer.apply()
def post(**args):
    """
    Placeholder : intended to upload the tensorflow checkpoint to a data-store (mongodb, bigquery, s3)
    Nothing is implemented yet, the arguments are accepted and ignored.
    """
    return None
def get(**args):
    """
    Placeholder : intended to restore a checkpoint from a persistent storage on to disk
    Nothing is implemented yet, the arguments are accepted and ignored.
    """
    return None
def generate(**args):
    """
    This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset.
    Every keyword argument received is forwarded as-is to gan.Predict, in addition to the ones set here
    ('context', 'column' and 'values')
    @return pandas.DataFrame    copy of the input where each requested column is replaced by the generated one

    :data   data-frame to be synthesized, or the path of a CSV file to load it from
    :column column or list of columns that need to be synthesized (discrete)
    :id     column identifying an entity (optional, not read by this function)
    :logs   location on disk where the learnt knowledge of the dataset is (not read here, forwarded to gan.Predict)
    """
    df      = args['data'] if not isinstance(args['data'], str) else pd.read_csv(args['data'])
    columns = args['column'] if isinstance(args['column'], list) else [args['column']]
    #
    #@TODO:
    #   If the identifier is not present, we should find a way to determine or make one
    #   The identifier is not used below, so it is no longer read here and a missing 'id' does not raise a KeyError
    #
    _df = df.copy()     # the frame provided by the caller is left untouched
    for col in columns :
        args['context'] = col
        args['column']  = col
        #
        # Distinct values of the column, in order of first appearance
        # NOTE(review): train() encodes the column with pd.get_dummies (categories sorted) while the values
        # here are not sorted, confirm gan.Predict maps them consistently
        #
        args['values']  = df[col].unique().tolist()
        #
        # we can determine the cardinalities here so we know what to allow or disallow
        handler = gan.Predict(**args)
        handler.load_meta(col)
        r = handler.apply()
        _df[col] = r[col]
    return _df
fixes with the framework - only supports single feature 2020-01-01 05:27:53 +00:00			`"""`
			`(c) 2019 Data Maker, hiplab.mc.vanderbilt.edu`
			`version 1.0.0`

			`This package serves as a proxy to the overall usage of the framework.`
			`This package is designed to generate synthetic data from a dataset from an original dataset using deep learning techniques`

			`@TODO:`
			`- Make configurable GPU, EPOCHS`
			`"""`
			`import pandas as pd`
			`import numpy as np`
bug fix with imports 2020-01-05 05:02:15 +00:00			`import data.gan as gan`
bug fixes with operations 2020-01-04 03:47:05 +00:00			`from transport import factory`
bug fix with binary matrix generation 2020-02-18 08:59:39 +00:00			`from data.bridge import Binary`
not sure about the changes (oops) 2020-02-11 18:00:16 +00:00			`import threading as thread`
fixes with the framework - only supports single feature 2020-01-01 05:27:53 +00:00			`def train (**args) :`
			`"""`
			`This function is intended to train the GAN in order to learn about the distribution of the features`
			`:column columns that need to be synthesized (discrete)`
			`:logs where the output of the (location on disk)`
			`:id identifier of the dataset`
			`:data data-frame to be synthesized`
			`:context label of what we are synthesizing`
			`"""`
not sure about the changes (oops) 2020-02-11 18:00:16 +00:00			`column = args['column'] if (isinstance(args['column'],list)) else [args['column']]`
fixes with the framework - only supports single feature 2020-01-01 05:27:53 +00:00
removing conditions, it blows up computational space 2020-02-18 18:25:47 +00:00			`# column_id = args['id']`
bug fixes 2020-01-10 19:12:58 +00:00			`df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])`
not sure about the changes (oops) 2020-02-11 18:00:16 +00:00			`df.columns = [name.lower() for name in df.columns]`
fixes with the framework - only supports single feature 2020-01-01 05:27:53 +00:00
not sure about the changes (oops) 2020-02-11 18:00:16 +00:00			`#`
			`# If we have several columns we will proceed one at a time (it could be done in separate threads)`
			`# @TODO : Consider performing this task on several threads/GPUs simulataneously`
			`#`
bug fix with binary matrix generation 2020-02-18 08:59:39 +00:00			`handler = Binary()`
			`# args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values`
removing conditions, it blows up computational space 2020-02-18 18:25:47 +00:00			`# args['label'] = handler.Export(df[[column_id]])`
			`# args['label'] = np.ones(df.shape[0]).reshape(df.shape[0],1)`
not sure about the changes (oops) 2020-02-11 18:00:16 +00:00			`for col in column :`
bug fix with class hierarchy 2020-02-20 15:52:53 +00:00			`args['real'] = pd.get_dummies(df[col]).astype(np.float32).values`
			`# args['real'] = handler.Export(df[[col]])`
not sure about the changes (oops) 2020-02-11 18:00:16 +00:00			`args['column'] = col`
			`args['context'] = col`
			`context = args['context']`
			`if 'store' in args :`
			`args['store']['args']['doc'] = context`
			`logger = factory.instance(**args['store'])`
			`args['logger'] = logger`

			`else:`
			`logger = None`
			`trainer = gan.Train(**args)`
			`trainer.apply()`
			`def post(**args):`
			`"""`
			`This uploads the tensorflow checkpoint to a data-store (mongodb, biguqery, s3)`

			`"""`
			`pass`
bug fix ... need to design porting/loading models on the fly 2020-02-12 18:43:30 +00:00			`def get(**args):`
not sure about the changes (oops) 2020-02-11 18:00:16 +00:00			`"""`
			`This function will restore a checkpoint from a persistant storage on to disk`
			`"""`
			`pass`
fixes with the framework - only supports single feature 2020-01-01 05:27:53 +00:00			`def generate(**args):`
			`"""`
			`This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset`
			`@return pandas.DataFrame`

			`:data data-frame to be synthesized`
			`:column columns that need to be synthesized (discrete)`
			`:id column identifying an entity`
			`:logs location on disk where the learnt knowledge of the dataset is`
			`"""`
bug fixes 2020-01-10 19:12:58 +00:00			`# df = args['data']`
			`df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data'])`
not sure about the changes (oops) 2020-02-11 18:00:16 +00:00
			`column = args['column'] if (isinstance(args['column'],list)) else [args['column']]`
fixes with the framework - only supports single feature 2020-01-01 05:27:53 +00:00			`column_id = args['id']`
			`#`
			`#@TODO:`
			`# If the identifier is not present, we should fine a way to determine or make one`
			`#`
bug fix with binary matrix generation 2020-02-18 08:59:39 +00:00			`# args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values`
			`bwrangler = Binary()`
removing conditions, it blows up computational space 2020-02-18 18:25:47 +00:00			`# args['label'] = bwrangler.Export(df[[column_id]])`
not sure about the changes (oops) 2020-02-11 18:00:16 +00:00			`_df = df.copy()`
			`for col in column :`
			`args['context'] = col`
			`args['column'] = col`
			`values = df[col].unique().tolist()`
			`# values.sort()`
			`args['values'] = values`
			`#`
			`# we can determine the cardinalities here so we know what to allow or disallow`
			`handler = gan.Predict (**args)`
			`handler.load_meta(col)`
			`r = handler.apply()`
			`# print (r)`
			`_df[col] = r[col]`
			`# break`
bug fix with number of GPU, columns as identifiers 2020-01-10 15:53:23 +00:00			`return _df`