Merge branch 'release' of aou/data-maker into master

2020-01-04 23:09:10 -06:00 · 2020-01-04 23:09:10 -06:00 · c14b8071e3
parent 9abb1e8166 ef39969082
commit c14b8071e3
10 changed files with 218 additions and 26 deletions
--- a/4
+++ b/4
@ -2,7 +2,7 @@ from ubuntu
 RUN ["apt-get","update"]
 RUN ["apt-get","upgrade","-y"]
 RUN ["apt-get","install","-y","git", "python3-dev","tmux","locales","python3-pip","python3-numpy","python3-pandas","locales"]
-RUN ["pip3","install","pandas-gbq","tensorflow"]
+RUN ["pip3","install","pandas-gbq","tensorflow","git+https://hiplab.mc.vanderbilt.edu/git/aou/"]
 RUN ["mkdir","-p","/usr/apps"]
 WORKDIR /usr/apps
-RUN ["git","clone","https://hiplab.mc.vanderbilt.edu/git/gan.git","aou-gan"]
+# git clone takes the branch through --branch; a pip-style "@release" suffix would be read as part of the repository URL
+RUN ["git","clone","--branch","release","https://hiplab.mc.vanderbilt.edu/git/aou/bridge.git","aou-gan"]
--- a/README.md
+++ b/README.md
@ -1,2 +1,63 @@
-# bridge
+## Introduction
 This package is designed to generate synthetic data from an original dataset using deep learning techniques
    - Generative Adversarial Networks
    - With "Earth mover's distance"
 ## Installation
    pip install git+https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git@release
 ## Usage
 After installing, the easiest way to get started is with pandas. The process is as follows:
 **Train the GAN on the original/raw dataset**
    import pandas as pd
    import data.maker
    df      = pd.read_csv('sample.csv')
    column  = 'gender'
    id      = 'id' 
    context = 'demo'
    data.maker.train(context=context,data=df,column=column,id=id,logs='logs')
 The trainer will store the data on disk (for now) in a structured folder that will hold training models that will be used to generate the synthetic data.
 **Generate a candidate dataset from the learned features**
    import pandas as pd
    import data.maker
    df  = pd.read_csv('sample.csv')
    id  = 'id'
    column = 'gender'
    context = 'demo'
    data.maker.generate(context=context,data=df,id=id,column=column,logs='logs')
 ## Limitations
 GANs generate data on the assumption that the original data covers the whole value space needed:
 - No new data will be created
        Assuming we have a dataset with a gender attribute with values [M,F]. 
        The synthetic data will not be able to generate genders outside [M,F]
 - Not advised on continuous values
        GANs work well on discrete values and thus are not advised for continuous ones.
        e.g:measurements (height, blood pressure, ...)
 - For now will only perform on a single feature.
 ## Credits :
 - [Ziqi Zhang](mailto:ziqi.zhang@vanderbilt.edu)
 - [Brad Malin](mailto:b.malin@vanderbilt.edu)
 - [Steve L. Nyemba](mailto:steve.l.nyemba@vanderbilt.edu)
--- a/data/WGAN.py
+++ b/data/WGAN.py
--- a/data/init.py
+++ b/data/init.py
@ -0,0 +1,2 @@
 import data.params as params
--- a/data/bridge.py
+++ b/data/bridge.py
--- a/data/gan.py
+++ b/data/gan.py
@ -11,9 +11,10 @@ import pandas as pd
 import time
 import os
 import sys
-from params import SYS_ARGS
+from data.params import SYS_ARGS
-from bridge import Binary
+from data.bridge import Binary
 import json
 import pickle
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ['CUDA_VISIBLE_DEVICES'] = "0"
@ -37,10 +38,8 @@ class GNet :
        self.layers = void()
        self.layers.normalize = self.normalize
        self.get = void()
        self.get.variables = self._variable_on_cpu
-        self.NUM_GPUS = 1
+        self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu']
        self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854
@ -64,6 +63,10 @@ class GNet :
        self._REAL = args['real'] if 'real' in args else None
        self._LABEL = args['label'] if 'label' in args else None
        self.get = void()
        self.get.variables = self._variable_on_cpu
        self.get.suffix = lambda : "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
        self.logger = args['logger'] if 'logger' in args and args['logger'] else None
        self.init_logs(**args)
    def init_logs(self,**args):
@ -83,7 +86,9 @@ class GNet :
        This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model.
        Because prediction and training can happen independently
        """
-        _name = os.sep.join([self.out_dir,'meta-'+column+'.json'])
+        # suffix = "-".join(column) if isinstance(column,list)else column
        suffix = self.get.suffix()
        _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json'])
        if os.path.exists(_name) :
            attr = json.loads((open(_name)).read())
            for key in attr :
@ -94,7 +99,7 @@ class GNet :
    def log_meta(self,**args) :
-        object = {
+        _object = {
            'CONTEXT':self.CONTEXT,
            'ATTRIBUTES':self.ATTRIBUTES,
            'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU,
@ -111,9 +116,13 @@ class GNet :
            key = args['key']
            value= args['value']
            # The dictionary built above is named _object; writing to `object` would target the builtin type and raise a TypeError
            _object[key] = value
-        _name = os.sep.join([self.out_dir,'meta-'+SYS_ARGS['column']])
+        # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column
        suffix = self.get.suffix()
        _name = os.sep.join([self.out_dir,'meta-'+suffix])
        f = open(_name+'.json','w')
-        f.write(json.dumps(object))
+        f.write(json.dumps(_object))
        return _object
    def mkdir (self,path):
        if not os.path.exists(path) :
            os.mkdir(path)        
@ -285,8 +294,10 @@ class Train (GNet):
        self.discriminator = Discriminator(**args)
        self._REAL = args['real']
        self._LABEL= args['label']
        self.column = args['column']
        # print ([" *** ",self.BATCHSIZE_PER_GPU])
-        self.log_meta()
+        
        self.meta = self.log_meta()
    def load_meta(self, column):
        """
        This function will delegate the calls to load meta data to it's dependents
@ -384,7 +395,7 @@ class Train (GNet):
            # saver = tf.train.Saver()
            saver   = tf.compat.v1.train.Saver()
            init    = tf.global_variables_initializer()
-
+            logs = []
            with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:
                sess.run(init)
                sess.run(iterator_d.initializer,
@ -406,13 +417,22 @@ class Train (GNet):
                    format_str = 'epoch: %d, w_distance = %f (%.1f)'
                    print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration))
-                    if epoch % self.MAX_EPOCHS == 0:
+                    # print (dir (w_distance))
-                        _name  = os.sep.join([self.train_dir,self.ATTRIBUTES['synthetic']])
+                    logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) })
                    if epoch % self.MAX_EPOCHS == 0:
                        # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
                        suffix = self.get.suffix()
                        _name  = os.sep.join([self.train_dir,suffix])
                        # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch)
                        saver.save(sess, _name, write_meta_graph=False, global_step=epoch)
                        #
                        #
                        if self.logger :
                            row = {"logs":logs} #,"model":pickle.dump(sess)}
                            self.logger.write(row=row)
 class Predict(GNet):
    """
@ -421,13 +441,15 @@ class Predict(GNet):
    def __init__(self,**args):
        GNet.__init__(self,**args)        
        self.generator = Generator(**args)        
-        self.values  = values
+        self.values  = args['values']
    def load_meta(self, column):
        super().load_meta(column)
        self.generator.load_meta(column)
    def apply(self,**args):
        # print (self.train_dir)
-        model_dir = os.sep.join([self.train_dir,self.ATTRIBUTES['synthetic']+'-'+str(self.MAX_EPOCHS)])
+        # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
        suffix = self.get.suffix()
        model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)])
        demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo']
        tf.compat.v1.reset_default_graph()
        z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM])
@ -450,19 +472,24 @@ class Predict(GNet):
            # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes
            #
-            df =  ( pd.DataFrame(np.round(f).astype(np.int32),columns=values))
+            df =  ( pd.DataFrame(np.round(f).astype(np.int32)))
            # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms
            # df = (i * df).sum(axis=1)
            #
            # In case we are dealing with actual values like diagnosis codes we can perform 
            #
-            r = np.zeros((self.ROW_COUNT,1))
+            columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']]
            r = np.zeros((self.ROW_COUNT,len(columns)))
            for col in df :
                i = np.where(df[col])[0]
                r[i] = col
            df = pd.DataFrame(r,columns=[self.ATTRIBUTES['synthetic']])
-            return df.to_dict(orient='list')
+            df = pd.DataFrame(r,columns=columns)
            df[df.columns] = (df.apply(lambda value: self.values[ int(value)],axis=1))
            return df.to_dict(orient='lists')
            # return df.to_dict(orient='list')
            # count = str(len(os.listdir(self.out_dir)))
            # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv'])
            # df.to_csv(_name,index=False)
@ -476,7 +503,7 @@ class Predict(GNet):
            #         idx2 = (demo[:, n] == 1)
            #         idx = [idx1[j] and idx2[j] for j in range(len(idx1))]
            #         num = np.sum(idx)
-            #         print ("_____________________")
+            #         print ("___________________list__")
            #         print (idx1)
            #         print (idx2)
            #         print (idx)
@ -531,7 +558,8 @@ if __name__ == '__main__' :
    elif 'generate' in SYS_ARGS:
        values = df[column].unique().tolist()
        values.sort()
-        p = Predict(context=context,label=LABEL,values=values)
+        
        p = Predict(context=context,label=LABEL,values=values,column=column)
        p.load_meta(column)
        r = p.apply()
        print (df)
@ -539,6 +567,7 @@ if __name__ == '__main__' :
        df[column] = r[column]
        print (df)
    else:
        print (SYS_ARGS.keys())
        print (__doc__)
--- a/data/maker/init.py
+++ b/data/maker/init.py
@ -0,0 +1,75 @@
 """
 (c) 2019 Data Maker, hiplab.mc.vanderbilt.edu
 version 1.0.0
 This package serves as a proxy to the overall usage of the framework.
 This package is designed to generate synthetic data from an original dataset using deep learning techniques
@TODO:
    - Make configurable GPU, EPOCHS
 """
 import pandas as pd
 import numpy as np
 import data.gan as gan
 from transport import factory
 def train (**args) :
    """
    Train the GAN so that it learns the distribution of the selected feature.

    :column     column that needs to be synthesized (discrete), one-hot encoded before training
    :id         column identifying an entity, one-hot encoded and handed to the trainer as the labels
    :data       data-frame holding the original data
    :logs       forwarded as-is to gan.Train (location on disk of the training output)
    :context    label of what we are synthesizing
    :max_epochs (optional) number of epochs, 10 when not provided
    :store      (optional) arguments of transport.factory.instance used to create a logger
    @return the outcome of gan.Train.apply
    """
    target      = args['column']
    identifier  = args['id']
    frame       = args['data']
    log_dir     = args['logs']

    def one_hot(name) :
        # one-hot encode a column of the data-frame as a float32 matrix
        return pd.get_dummies(frame[name]).astype(np.float32).values

    real        = one_hot(target)
    labels      = one_hot(identifier)
    epochs      = args.get('max_epochs',10)
    context     = args['context']
    logger      = None
    if 'store' in args :
        #
        # The context is written into the store configuration given by the caller (key 'doc')
        # before the logger is created from it
        store = args['store']
        store['args']['doc'] = context
        logger = factory.instance(**store)
    trainer = gan.Train(
        context=context,
        max_epochs=epochs,
        real=real,
        label=labels,
        column=target,
        column_id=identifier,
        logger=logger,
        logs=log_dir
    )
    return trainer.apply()
 def generate(**args):
    """
    This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset

    :data       data-frame to be synthesized
    :column     column that needs to be synthesized (discrete)
    :id         column identifying an entity
    :logs       location on disk where the learnt knowledge of the dataset is
    :context    label of what we are synthesizing
    @return pandas.DataFrame, a copy of the input in which the column holds the generated values
    """
    df          = args['data']
    column      = args['column']
    column_id   = args['id']
    logs        = args['logs']
    context     = args['context']
    #
    #@TODO:
    #   If the identifier is not present, we should find a way to determine or make one
    #
    # The distinct values of the column are handed over sorted to the predictor
    values = sorted(df[column].unique().tolist())
    labels = pd.get_dummies(df[column_id]).astype(np.float32).values
    #
    # logs is forwarded the same way train forwards it to gan.Train; without it the location
    # given by the caller never reaches the predictor
    # NOTE(review): presumably the predictor then falls back to a default folder -- confirm in GNet.init_logs
    handler = gan.Predict (context=context,label=labels,values=values,column=column,logs=logs)
    handler.load_meta(column)
    r =  handler.apply()
    #
    # The original data-frame is left untouched, only the copy receives the synthetic column
    _df = df.copy()
    _df[column] = r[column]
    return _df
--- a/data/maker/main.py
+++ b/data/maker/main.py
@ -0,0 +1,10 @@
 import pandas as pd
 import data.maker
 #
 # Demo: train on the 'gender' column of sample.csv, the store describes the writer used for the training logs
 df      = pd.read_csv('sample.csv')
 column, id, context = 'gender', 'id', 'demo'
 max_epochs = 11
 store = {
    "type":"mongo.MongoWriter",
    "args":{"host":"localhost:27017","dbname":"GAN"}
 }
 data.maker.train(
    store=store,
    max_epochs=max_epochs,
    context=context,
    data=df,
    column=column,
    id=id,
    logs='foo'
 )
--- a/data/params.py
+++ b/data/params.py
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,15 @@
 from setuptools import setup, find_packages
 import os
 import sys
 def read(fname):
    """
    Return the content of a file located relative to the folder of this script.

    :fname  name of the file, joined to the folder that holds this file
    @return the text of the file
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    # the context manager closes the handle, which was previously left open
    with open(path) as f :
        return f.read()
 #
 # Meta-data of the package handed to setuptools
 args = {
    "name":"data-maker",
    "version":"1.0.0",
    "author":"Vanderbilt University Medical Center",
    "author_email":"steve.l.nyemba@vanderbilt.edu",
    "license":"MIT",
    "packages":find_packages(),
    "keywords":["healthcare","data","transport","protocol"],
    "install_requires":[
        'data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git',
        'numpy',
        'pandas',
        'pandas-gbq',
        'pymongo'
    ],
    "url":'https://hiplab.mc.vanderbilt.edu/aou/gan.git'
 }
 if sys.version_info.major == 2 :
    # these two options are only set when running under python 2
    args.update({
        'use_2to3':False,
        'use_2to3_exclude_fixers':['lib2to3.fixes.fix_import']
    })
 setup(**args)