""" (c) 2019 Data Maker, hiplab.mc.vanderbilt.edu version 1.0.0 This package serves as a proxy to the overall usage of the framework. This package is designed to generate synthetic data from a dataset from an original dataset using deep learning techniques @TODO: - Make configurable GPU, EPOCHS """ import pandas as pd import numpy as np import data.gan as gan from transport import factory import threading as thread def train (**args) : """ This function is intended to train the GAN in order to learn about the distribution of the features :column columns that need to be synthesized (discrete) :logs where the output of the (location on disk) :id identifier of the dataset :data data-frame to be synthesized :context label of what we are synthesizing """ column = args['column'] if (isinstance(args['column'],list)) else [args['column']] column_id = args['id'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) df.columns = [name.lower() for name in df.columns] # # If we have several columns we will proceed one at a time (it could be done in separate threads) # @TODO : Consider performing this task on several threads/GPUs simulataneously # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values for col in column : args['real'] = pd.get_dummies(df[col]).astype(np.float32).values args['column'] = col args['context'] = col context = args['context'] if 'store' in args : args['store']['args']['doc'] = context logger = factory.instance(**args['store']) args['logger'] = logger else: logger = None trainer = gan.Train(**args) trainer.apply() def post(**args): """ This uploads the tensorflow checkpoint to a data-store (mongodb, biguqery, s3) """ pass def get(**args): """ This function will restore a checkpoint from a persistant storage on to disk """ pass def generate(**args): """ This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset @return pandas.DataFrame :data data-frame to be synthesized :column columns that need to be synthesized (discrete) :id column identifying an entity :logs location on disk where the learnt knowledge of the dataset is """ # df = args['data'] df = args['data'] if not isinstance(args['data'],str) else pd.read_csv(args['data']) column = args['column'] if (isinstance(args['column'],list)) else [args['column']] column_id = args['id'] # #@TODO: # If the identifier is not present, we should fine a way to determine or make one # args['label'] = pd.get_dummies(df[column_id]).astype(np.float32).values _df = df.copy() for col in column : args['context'] = col args['column'] = col values = df[col].unique().tolist() # values.sort() args['values'] = values # # we can determine the cardinalities here so we know what to allow or disallow handler = gan.Predict (**args) handler.load_meta(col) r = handler.apply() # print (r) _df[col] = r[col] # break return _df