Merge branch 'dev' of aou/data-maker into master

2022-11-21 21:04:47 +00:00 · 2022-11-21 21:04:47 +00:00 · 59d6cc50c0
parent 76e84c3859 b2cf5ead53
commit 59d6cc50c0
17 changed files with 3436 additions and 656 deletions
--- a/README.md
+++ b/README.md
@ -13,17 +13,19 @@ This package is designed to generate synthetic data from a dataset from an origi

 After installing the easiest way to get started is as follows (using pandas). The process is as follows:

+Read about [data-transport on github](https://github.com/lnyemba/data-transport) or on [healthcareio.the-phi.com/git/code/transport](https://healthcareio.the-phi.com/git/code/transport.git)
+
 **Train the GAN on the original/raw dataset**

+1. We define the data sources
+
+The sources will consist of source, target and logger.

    import pandas as pd
    import data.maker
+    import transport
+    from transport import providers

-    df      = pd.read_csv('sample.csv')
-    column  = 'gender'
-    id      = 'id' 
-    context = 'demo'
-    data.maker.train(context=context,data=df,column=column,id=id,logs='logs')

 The trainer will store the data on disk (for now) in a structured folder that will hold training models that will be used to generate the synthetic data.

--- a/bin/data-maker
+++ b/bin/data-maker
@ -0,0 +1 @@
+pipeline.py
--- a/binder.py
+++ b/binder.py
@ -0,0 +1,377 @@
+#!/usr/bin/env python3
+"""
+This file will perform basic tasks to finalize the GAN process by performing the following :
+    - basic stats & analytics
+    - rebuild io to another dataset
+"""
+import pandas as pd
+import numpy as np
+from multiprocessing import Process, Lock
+from google.oauth2 import service_account
+from google.cloud import bigquery as bq
+import transport
+from data.params import SYS_ARGS 
+import json
+
+import pandas as pd
+import numpy as np
+from google.oauth2 import service_account
+import json
+
+# path = '../curation-prod.json'
+# credentials = service_account.Credentials.from_service_account_file(path)
+# df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard')
+filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config']
+f = open(filename)
+config = json.loads(f.read())
+args = config['pipeline']
+f.close()
+
+def _formatSQL(**_args):
+    """
+    This function will build the _map for a given segment
+    """
+    sql = """
+    select DISTINCT x.person_id synthetic,y.person_id original 
+    FROM :synthetic.:table x 
+    INNER JOIN :original.:table y on x.person_id in (:ids)
+    AND x.person_id <> y.person_id AND x.gender_source_value = y.gender_source_value 
+    AND x.year_of_birth = y.year_of_birth 
+    ORDER BY 1
+    """
+    table= _args['table']
+    original,synthetic = _args['schema']['original'],_args['schema']['synthetic']
+    _ids = np.array(_args['ids']).astype(str)
+    return sql.replace(":ids",",".join(_ids)).replace(":synthetic",synthetic).replace(":original",original).replace(":table",table)
+def _addCounts(**_args) :
+    store   = _args['store']
+    sql     = _args['sql']
+    reader = transport.factory.instance(**store['source'])
+    _df = reader.read(sql=sql)
+    _ids = _df.synthetic.unique()
+    _counts = [ np.sum(_df.synthetic == value) for value in _ids]
+    original = [_df[_df.synthetic == value].iloc[np.random.choice(np.arange(_counts[_ids.tolist().index(value)]),1),:].original.values[0] for value in _ids]
+    _df = pd.DataFrame({"synthetic":_ids,"original":original,"counts":_counts})
+
+    #
+    # We can post this to the backend ...
+    #
+    table = '_map'  #-- Yes this is hard-coded
+    writer = transport.factory.instance(**dict(store['target'],**{"parallel":True,"table":table}))
+    # if writer.has(table=table) is False:
+    #     writer.write(_df)
+    # else:
+    _schema = [{"name":name,"type":"INTEGER"} for name in _df.columns]
+    writer.write(_df,schema=_schema)
+    
+        
+    
+    
+
+def Init(**_args) :
+    """
+    This function will build a map of the synthetic to real individuals. 
+    The assumption is that the synthesized data is stored in the same data-store as the original. The parameters provided are :
+    :param  store       object from the configuration file with source,target entries
+    :param  table       name of the original/synthetic tables (they should be the same)
+    :param  feat.       features/attributes ... demographics to account for
+    """
+    store = _args['store']
+    reader = transport.factory.instance(**store['source'])
+    original,synthetic = _args['schema']['original'],_args['schema']['synthetic']
+    table = _args['table']
+    sql = _args['sql'].replace(':synthetic',synthetic).replace(':original',original).replace(':table',table)
+   
+    _map = reader.read(sql=sql)
+    
+    
+    
+    k = _args['k'] if 'k' in _args else 2
+    # _iodf = reader.read(table=table)
+    # _ids = _iodf['person_id'].unique().tolist()
+    # x_  = np.array_split(_ids,1000)
+    jobs = []
+    # for _items in x_ :
+    #     _p = {"ids":_items,"schema":_args['schema'],'store':store,'table':table}
+    #     sql = _formatSQL(**_p)
+    #     _p['sql'] = sql
+    #     _apply = lambda params: _addCounts(**params)
+    #     thread = Process(target=_apply,args=(_p,))
+    #     thread.start()
+    #     jobs.append(thread)
+
+    # return jobs
+    #
+    # We have performed a m:m (many-to-many) relationship with original participants and synthetic participants
+    # The goal is to obtain a singular map against which records will be migrated
+    #
+    print (['... computing counts (k)'])
+    _ids = _map.synthetic.unique()
+    _counts = [ np.sum(_map.synthetic == value) for value in _ids]
+    original = [_map[_map.synthetic == value].iloc[np.random.choice(np.arange(_counts[_ids.tolist().index(value)]),1),:].original.values[0] for value in _ids]
+    print (['Building k-classes/groups'])
+    _mdf = pd.DataFrame({"synthetic":_ids,"original":original,"counts":_counts})
+    i = _mdf.apply(lambda row: row.counts >= k,axis=1)
+    _mdf = _mdf[i]
+    #
+    # Log what just happened here so we know about the equivalence classes, 
+    # {"module":"binder","action":"map-generation","input":{"k":k,"rows":{"synthetic":_mdf.shape[0],"original":len(_counts)}}}
+    
+    return _mdf
+    #
+    # now we are posting this to target storage ...
+    #
+def ApplyOn (**_args):
+    """
+    This  function will rewrite SQL that applies the synthetic identifier to the entries of the pipeline
+    We assume that the _map has two attributes (synthetic and original)
+    :param  store
+    :param  _config
+    """
+    store_args = _args['store']
+    _config = _args['config']
+    
+    table = _config['from']
+    reader  = transport.factory.instance(**dict(store_args['source'],**{"table":table}))
+    attr = reader.read(limit=1).columns.tolist()
+    original_key = _args['original_key'] #-- assuming referential integrity
+    
+    # synthetic_key= columns['synthetic']
+    # mapped_original=columns['orginal']
+    fields = list(set(attr) - set([original_key]))
+    sql = "select _map.synthetic as :original_key,:fields from :original_schema.:table inner join :synthetic_schema._map on _map.original = :table.:original_key"
+    sql = sql.replace(":table",table).replace(":fields",",".join(fields))
+    sql = sql.replace(":original_key",original_key)
+    _schema = _args['schema']
+    sql = sql.replace(":original_schema",_schema['original']).replace(":synthetic_schema",_schema['synthetic'])
+    
+    return reader.read (sql=sql)
+   
+if __name__ == '__main__' :
+    pass
+
+# class Analytics :
+#     """
+#     This class will compile basic analytics about a given dataset i.e compare original/synthetic
+#     """
+#     @staticmethod
+#     def distribution(**args):
+#         context = args['context']
+#         df = args['data']
+#         #
+#         #-- This data frame counts unique values for each feature (space)
+#         df_counts = pd.DataFrame(df.apply(lambda col: col.unique().size),columns=['counts']).T  # unique counts
+#         #
+#         #-- Get the distributions for common values
+#         #
+#         names   = [name for name in df_counts.columns.tolist() if name.endswith('_io') == False]
+#         ddf     = df.apply(lambda col: pd.DataFrame(col.values,columns=[col.name]).groupby([col.name]).size() ).fillna(0)
+#         ddf[context] = ddf.index
+          
+#         pass
+#     def distance(**args):
+#         """
+#         This function will measure the distance between 
+#         """
+#         pass
+# class Utils :
+#     @staticmethod
+#     def log(**args):
+#         logger = transport.factory.instance(type="mongo.MongoWriter",args={"dbname":"aou","doc":"logs"})        
+#         logger.write(args)
+#         logger.close()
+#     class get :
+#         @staticmethod
+#         def pipeline(table,path) :
+#             # contexts    = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts']
+#             config = json.loads((open(path)).read())
+#             pipeline    = config['pipeline']
+#             # return [ item for item in pipeline if item['context'] in contexts]
+#             pipeline =  [item for item in pipeline if 'from' in item and item['from'].strip() == table]
+#             Utils.log(module=table,action='init',input={"pipeline":pipeline})
+#             return pipeline
+#         @staticmethod
+#         def sql(**args) :
+#             """
+#             This function is intended to build SQL query for the remainder of the table that was not synthesized
+#             :config configuration entries
+#             :from   source of the table name
+#             :dataset    name of the source dataset
+            
+#             """
+#             SQL = ["SELECT * FROM :from "]
+#             SQL_FILTER = []
+#             NO_FILTERS_FOUND = True
+#             # pipeline = Utils.get.config(**args)
+#             pipeline = args['pipeline']
+#             REVERSE_QUALIFIER = {'IN':'NOT IN','NOT IN':'IN','=':'<>','<>':'='}
+#             for item in pipeline :
+                
+
+#                 if 'filter' in item :
+#                     if NO_FILTERS_FOUND :
+#                         NO_FILTERS_FOUND = False
+#                         SQL  += ['WHERE']
+#                     #
+#                     # Let us load the filter in the SQL Query
+#                     FILTER = item['filter']
+#                     QUALIFIER = REVERSE_QUALIFIER[FILTER['qualifier'].upper()]
+#                     SQL_FILTER += [" ".join([FILTER['field'], QUALIFIER,'(',FILTER['value'],')']).replace(":dataset",args['dataset'])]
+#             src = ".".join([args['dataset'],args['from']])
+#             SQL += [" AND ".join(SQL_FILTER)]
+#             #
+#             # let's pull the field schemas out of the table definition
+#             #
+#             Utils.log(module=args['from'],action='sql',input={"sql":" ".join(SQL) })
+#             return " ".join(SQL).replace(":from",src)
+
+        
+# def mk(**args) :
+#     dataset  = args['dataset']
+#     client  = args['client'] if 'client' in args else bq.Client.from_service_account_file(args['private_key'])
+#     #
+#     # let us see if we have a dataset handy here 
+#     #
+#     datasets = list(client.list_datasets())
+#     found = [item for item in datasets if item.dataset_id == dataset]
+    
+#     if not found :
+
+#         return client.create_dataset(dataset)
+#     return found[0] 
+        
+# def move (args):
+#     """
+#     This function will move a table from the synthetic dataset into a designated location
+#     This is the simplest case for finalizing a synthetic data set
+#     :private_key        
+#     """
+#     pipeline   = Utils.get.pipeline(args['from'],args['config'])
+#     _args = json.loads((open(args['config'])).read())
+#     _args['pipeline'] = pipeline
+#     # del _args['pipeline']
+#     args = dict(args,**_args)
+#     # del args['pipeline']
+#     # private_key = args['private_key']
+#     client      = bq.Client.from_service_account_json(args['private_key'])
+
+#     dataset     = args['dataset']
+#     if pipeline :
+#         SQL         = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in pipeline]
+#         SQL         += [Utils.get.sql(**args)]
+#         SQL         =  ('\n UNION ALL \n'.join(SQL).replace(':dataset','io'))
+#     else:
+#         #
+#         # moving a table to a designated location
+#         tablename = args['from']
+#         if 'sql' not in args :
+#             SQL = "SELECT * FROM :dataset.:table"
+#         else:
+#             SQL = args['sql']
+#         SQL = SQL.replace(":dataset",dataset).replace(":table",tablename)
+#     Utils.log(module=args['from'],action='sql',input={'sql':SQL})
+#     #
+#     # At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table
+#     #
+    
+    
+
+#     odataset    = mk(dataset=dataset+'_io',client=client)
+#     # SQL =       "SELECT * FROM io.:context_full_io".replace(':context',context)
+#     config = bq.QueryJobConfig()
+#     config.destination = client.dataset(odataset.dataset_id).table(args['from'])
+#     config.use_query_cache = True
+#     config.allow_large_results = True
+#     config.priority = 'INTERACTIVE'
+#     #
+#     #
+
+#     schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema
+#     fields = [" ".join(["CAST (",item.name,"AS",item.field_type.replace("INTEGER","INT64").replace("FLOAT","FLOAT64"),") ",item.name]) for item in schema]
+#     SQL = SQL.replace("*"," , ".join(fields))
+#     # print (SQL)
+#     out = client.query(SQL,location='US',job_config=config)
+#     Utils.log(module=args['from'],action='move',input={'job':out.job_id})
+#     return (out.job_id)
+    
+    
+
+
+# import pandas as pd
+# import numpy as np
+# from google.oauth2 import service_account
+# import json
+
+# # path = '../curation-prod.json'
+# # credentials = service_account.Credentials.from_service_account_file(path)
+# # df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard')
+# filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config']
+# f = open(filename)
+# config = json.loads(f.read())
+# args = config['pipeline']
+# f.close()
+
+
+# if __name__ == '__main__' :
+#     """
+#     Usage :
+#         finalize --<move|stats> --contexts <c1,c2,...c3> --from <table>
+#     """
+    
+#     if 'move' in SYS_ARGS :
+
+#         if 'init' in SYS_ARGS :
+#             dep = config['dep'] if 'dep' in config else {}
+#             info = []
+            
+#             if 'queries' in dep :
+#                 info += dep['queries']
+#                 print ('________')
+#             if 'tables' in dep :
+#                 info += dep['tables']
+#             args = {}
+#             jobs = []
+#             for item in info :
+#                 args = {}
+#                 if type(item) == str :
+#                     args['from'] = item
+#                     name = item
+#                 else:
+#                     args = item
+#                     name = item['from']
+#                 args['config'] = SYS_ARGS['config']
+#                 # args['pipeline'] = []
+#                 job = Process(target=move,args=(args,))
+#                 job.name = name
+#                 jobs.append(job)
+#                 job.start()
+                
+
+#             # while len(jobs) > 0 :
+#             #     jobs = [job for job in jobs if job.is_alive()]
+#             #     time.sleep(1)
+            
+
+#         else:
+#             move(SYS_ARGS)
+#         # # table = SYS_ARGS['from']
+#         # # args = dict(config,**{"private_key":"../curation-prod.json"})
+#         # args = dict(args,**SYS_ARGS)        
+#         # contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']]
+#         # log = []
+#         # if contexts :
+#         #     args['contexts'] = contexts
+#         #     log = move(**args)
+            
+#         # else:
+#         #     tables = args['from'].split(',')
+#         #     for name in tables :
+#         #         name = name.strip()
+#         #         args['from'] = name
+#         #         log += [move(**args)]
+#         # print ("\n".join(log))
+        
+        
+        
+#     else:
+#         print ("NOT YET READY !")
--- a/data/init.py
+++ b/data/init.py
@ -1,2 +1,6 @@
-import data.params as params
-
+# import data.params as params
+from data.params import SYS_ARGS
+import transport
+from multiprocessing import Process, Queue
+from data.maker import prepare
+from data.maker import state
--- a/data/bridge.py
+++ b/data/bridge.py
@ -153,21 +153,28 @@ class Binary :
    """
    This is a utility class to import and export a data to/from a binary matrix
    """
-    def __stream(self,column) :
+    def __stream(self,column,size=-1) :
        """
        This function will convert a column into a binary matrix with the value-space representing each column of the resulting matrix        
        :column a column vector i.e every item is a row
        """
        # values = np.unique(column)    
        
-        values = column.dropna().unique() 
-        values.sort()
+        # values = column.dropna().unique() 
+        
+        # values.sort()
+        # column = column.values
+        values = self.get_column(column,size)
+        column = column.values
        #
        # Let's treat the case of missing values i.e nulls 
        #       
        row_count,col_count = column.size,values.size
+        # if row_count * col_count > size and row_count < size:
            
-        matrix = [ np.zeros(col_count) for i in np.arange(row_count)]
+
+       
+        matrix = [ np.zeros(col_count,dtype=np.float32) for i in np.arange(row_count)]
        #
        # let's create a binary matrix of the feature that was passed in
        # The indices of the matrix are inspired by classical x,y axis 
@ -176,14 +183,52 @@ class Binary :
            
            for yi in np.arange(row_count) :
                value   = column[yi]
-                if value not in values :
-                    continue
+                # if value not in values :
+                #     continue
                xi = np.where(values == value)    
+                if xi and xi[0].size > 0:         
                    xi      = xi[0][0] #-- column index            
                    matrix[yi][xi] = 1
        
-        return matrix
-    def Export(self,df) :
+        return pd.DataFrame(matrix,columns=values)
+    def apply(self,column,size):
+        return self.__stream(column,size)
+    def get_column(self,column,size=-1):
+        """
+        This function will return the columns that are available for processing ...
+        """
+        values = column.dropna().value_counts().index.values
+        
+        if size > 0 and column.size > size:
+            values = values[:size]
+        values.sort()
+        return values
+    def get_missing(self,column,size=-1):
+        values = column.dropna().value_counts().index.values
+        if size > 0 and column.size > size :
+            values = values[size:]
+        else:
+            values = np.array([])
+        values.sort()
+        return values.tolist();
+        
+    def _get_column_values(self,column,size=-1):
+        values = column.dropna().unique() 
+        values.sort()
+        
+        #
+        # Let's treat the case of missing values i.e nulls 
+        #       
+        row_count,col_count = column.size,values.size
+        if col_count > size and size  > 0:
+            # N = np.divide(size,row_count).astype(int) 
+            # N = 
+            i = np.random.choice(col_count,size)
+            values = values[-i]
+            col_count = size
+        return values
+ 
+    def _Export(self,df) :
        """
        This function will convert a data-frame to a binary matrix
        :return _map,matrix
@ -191,12 +236,14 @@ class Binary :
        #
        # This will give us a map of how each column was mapped to a bitstream
        
-        _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0)
+        # _map = df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0)
+        # _map = df.fillna(np.nan).apply(lambda column: column,axis=0)
        
+        print (df.fillna(np.nan).apply(lambda column: self.__stream(column),axis=0))
        #
        # We will merge this to have a healthy matrix
        _matrix =  _map.apply(lambda row: list(list(itertools.chain(*row.values.tolist()))),axis=1)
-        _matrix = np.matrix([list(item) for item in _matrix])
+        _matrix = np.matrix([list(item) for item in _matrix]).astype(np.float32)
        #
        # let's format the map so we don't have an unreasonable amount of data
        #
@ -210,7 +257,8 @@ class Binary :
            _m[name] = {"start":beg,"end":end}
            beg = end

-        return _m,_matrix.astype(np.float32)        
+        # return _m,_matrix.astype(np.float32)        
+        return _matrix
        
    def Import(self,df,values,_map):
        """
@ -237,37 +285,41 @@ if __name__ == '__main__' :
        --pseudo    will create pseudonyms for a given
        --export    will export data to a specified location
    """
-    has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys()
-    has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys()
-    if has_basic and has_action :
-        builder = Builder()
-        if 'export' in SYS_ARGS :
-            print ()
-            print ("exporting ....")
-            if not os.path.exists(SYS_ARGS['export']) :
-                os.mkdir(SYS_ARGS['export'])
-            SQL = builder.encode(**SYS_ARGS)
-            #
-            # Assuming the user wants to filter the records returned :
-            #
+    df = pd.read_csv('sample.csv')
+    print ( df.race.value_counts())
+    print ( (Binary()).apply(df['race'], 3))

-            credentials = service_account.Credentials.from_service_account_file(SYS_ARGS['key'])
-            df  = pd.read_gbq(SQL,credentials =credentials,dialect='standard')
-            FILENAME = os.sep.join([SYS_ARGS['export'],SYS_ARGS['table']+'.csv'])
-            #
-            # This would allow us to export it to wherever we see fit
-            print (FILENAME)
-            df.to_csv(FILENAME,index=False)
-            f = open(FILENAME.replace('.csv','.sql'),'w+')
-            f.write(SQL)
-            f.close()
-        elif 'pseudo' in SYS_ARGS :
-            builder.process(**SYS_ARGS)
-    else:
-        print ("")
-        print (SYS_ARGS.keys())
-        print ("has basic ",has_basic)
-        print ("has action ",has_action)
+    # has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys()
+    # has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys()
+    # if has_basic and has_action :
+    #     builder = Builder()
+    #     if 'export' in SYS_ARGS :
+    #         print ()
+    #         print ("exporting ....")
+    #         if not os.path.exists(SYS_ARGS['export']) :
+    #             os.mkdir(SYS_ARGS['export'])
+    #         SQL = builder.encode(**SYS_ARGS)
+    #         #
+    #         # Assuming the user wants to filter the records returned :
+    #         #
+            
+    #         credentials = service_account.Credentials.from_service_account_file(SYS_ARGS['key'])
+    #         df  = pd.read_gbq(SQL,credentials =credentials,dialect='standard')
+    #         FILENAME = os.sep.join([SYS_ARGS['export'],SYS_ARGS['table']+'.csv'])
+    #         #
+    #         # This would allow us to export it to wherever we see fit
+    #         print (FILENAME)
+    #         df.to_csv(FILENAME,index=False)
+    #         f = open(FILENAME.replace('.csv','.sql'),'w+')
+    #         f.write(SQL)
+    #         f.close()
+    #     elif 'pseudo' in SYS_ARGS :
+    #         builder.process(**SYS_ARGS)
+    # else:
+    #     print ("")
+    #     print (SYS_ARGS.keys())
+    #     print ("has basic ",has_basic)
+    #     print ("has action ",has_action)
 # pseudonym.apply(table='person',dataset='wgan_original',key='./curation-test-2.json')        
 # args = {"dataset":"wgan_original","table":"observation","key":"./curation-test-2.json"}
 # builder = Builder()
--- a/data/gan.py
+++ b/data/gan.py
@ -1,11 +1,28 @@
 """
-usage :
-    optional :
-    --num_gpu   number of gpus to use will default to 1
-    --epoch     steps per epoch default to 256
+This code was originally written by Ziqi Zhang <ziqi.zhang@vanderbilt.edu> in order to generate synthetic data.
+The code is an implementation of a Generative Adversarial Network that uses the Wasserstein Distance (WGAN).
+It is intended to be used in 2 modes (embedded in code or using CLI)
+
+USAGE :
+
+The following parameters should be provided in a configuration file (JSON format)
+python data/maker --config <path-to-config-file.json>
+
+CONFIGURATION FILE STRUCTURE :
+
+        context         what it is you are loading (stroke, hypertension, ...)
+        data            path of the file to be loaded
+        logs            folder to store training model and meta data about learning
+        max_epochs      number of iterations in learning 
+        num_gpu         number of gpus to be used (will still run if the GPUs are not available)
+
+EMBEDDED IN CODE :
+
 """
 import tensorflow as tf
-from tensorflow.contrib.layers import l2_regularizer
+# from tensorflow.contrib.layers import l2_regularizer
+from tensorflow.keras import layers
+from tensorflow.keras.regularizers  import L2 as l2_regularizer
 import numpy as np
 import pandas as pd
 import time
@ -19,7 +36,7 @@ import pickle
 os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 os.environ['CUDA_VISIBLE_DEVICES'] = "0"
 os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
-
+tf.compat.v1.disable_eager_execution()
 # STEPS_PER_EPOCH         = int(SYS_ARGS['epoch']) if 'epoch' in SYS_ARGS else 256
 # NUM_GPUS                        = 1 if 'num_gpu' not in SYS_ARGS else int(SYS_ARGS['num_gpu'])
 # BATCHSIZE_PER_GPU   = 2000
@ -28,6 +45,10 @@ os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
 class void :
        pass
 class GNet :
+        def log(self,**args):
+                self.logs = dict(args,**self.logs)
+           
+                        
        """
        This is the base class of a generative network functions, the details will be implemented in the subclasses.
        An instance of this class is accessed as follows 
@ -37,26 +58,56 @@ class GNet :
        def __init__(self,**args):
                self.layers = void()
                self.layers.normalize = self.normalize
+                self.logs = {}

+                # self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu']
+                self.GPU_CHIPS = None if 'gpu' not in args else [args['gpu']]
+                if self.GPU_CHIPS is None:
+                        self.GPU_CHIPS = [0]
+                        if 'CUDA_VISIBLE_DEVICES' in os.environ :
+                                os.environ.pop('CUDA_VISIBLE_DEVICES')
+                        self.NUM_GPUS = 0
+                else:
+                        self.NUM_GPUS = len(self.GPU_CHIPS)
+                        # os.environ['CUDA_VISIBLE_DEVICES'] = str(self.GPU_CHIPS[0])
                
-        self.NUM_GPUS = 1 if 'num_gpu' not in args else args['num_gpu']
-       
+                self.PARTITION = args['partition'] if 'partition' in args else None
+                # if self.NUM_GPUS > 1 :
+                #     os.environ['CUDA_VISIBLE_DEVICES'] = "4"

                self.X_SPACE_SIZE = args['real'].shape[1] if 'real' in args else 854
                self.G_STRUCTURE = [128,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE]
                self.D_STRUCTURE = [self.X_SPACE_SIZE,256,128] #[self.X_SPACE_SIZE, self.X_SPACE_SIZE*2, self.X_SPACE_SIZE] #-- change 854 to number of diagnosis
                # self.NUM_LABELS         = 8 if 'label' not in args elif len(args['label'].shape) args['label'].shape[1]
+                
                if 'label' in args and len(args['label'].shape) == 2 :
                        self.NUM_LABELS = args['label'].shape[1]
                elif 'label' in args and len(args['label']) == 1 :
                        self.NUM_LABELS = args['label'].shape[0]
                else:
-            self.NUM_LABELS = 8
-        self.Z_DIM = 128 #self.X_SPACE_SIZE     
-        self.BATCHSIZE_PER_GPU = args['real'].shape[0] if 'real' in args else 256
+                        self.NUM_LABELS = None
+                # self.Z_DIM = 128 #self.X_SPACE_SIZE     
+                self.Z_DIM = 128  #-- used as rows down stream
+                self.G_STRUCTURE = [self.Z_DIM,self.Z_DIM]
+                PROPOSED_BATCH_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size'])
+                self.BATCHSIZE_PER_GPU = PROPOSED_BATCH_PER_GPU
+                if 'real' in args : 
+                        self.D_STRUCTURE = [args['real'].shape[1],256,self.Z_DIM]
+                
+                        if args['real'].shape[0]  < PROPOSED_BATCH_PER_GPU :
+                                self.BATCHSIZE_PER_GPU = int(args['real'].shape[0]* 1) 
+                # self.BATCHSIZE_PER_GPU = 2000 if 'batch_size' not in args else int(args['batch_size'])
                self.TOTAL_BATCHSIZE = self.BATCHSIZE_PER_GPU * self.NUM_GPUS
                self.STEPS_PER_EPOCH = 256 #int(np.load('ICD9/train.npy').shape[0] / 2000)       
                self.MAX_EPOCHS = 10 if 'max_epochs' not in args else int(args['max_epochs'])
+                CHECKPOINT_SKIPS = int(args['checkpoint_skips']) if 'checkpoint_skips' in args else int(self.MAX_EPOCHS/10)
+                
+                CHECKPOINT_SKIPS = 1 if CHECKPOINT_SKIPS < 1 else CHECKPOINT_SKIPS
+                # if self.MAX_EPOCHS  < 2*CHECKPOINT_SKIPS :
+                #         CHECKPOINT_SKIPS = 2
+                # self.CHECKPOINTS = [1,self.MAX_EPOCHS] + np.repeat( np.divide(self.MAX_EPOCHS,CHECKPOINT_SKIPS),CHECKPOINT_SKIPS ).cumsum().astype(int).tolist() 
+                self.CHECKPOINTS = np.repeat(CHECKPOINT_SKIPS, self.MAX_EPOCHS/ CHECKPOINT_SKIPS).cumsum().astype(int).tolist()
+               
                self.ROW_COUNT = args['real'].shape[0] if 'real' in args else 100
                self.CONTEXT = args['context']
                self.ATTRIBUTES = {"id":args['column_id'] if 'column_id' in args else None,"synthetic":args['column'] if 'column' in args else None}
@ -77,29 +128,53 @@ class GNet :
                for key in ['train','output'] :
                        self.mkdir(os.sep.join([self.log_dir,key]))
                        self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT]))
-            
+                        # if 'partition' in args :
+                        #        self.mkdir (os.sep.join([self.log_dir,key,self.CONTEXT,str(args['partition'])]))
                self.train_dir  = os.sep.join([self.log_dir,'train',self.CONTEXT])                
                self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT])
+                if 'partition' in args :
+                        self.train_dir = os.sep.join([self.train_dir,str(args['partition'])])
+                        self.out_dir = os.sep.join([self.out_dir,str(args['partition'])])
                
-    def load_meta(self,column):
+                for checkpoint in self.CHECKPOINTS :
+                        self.mkdir (os.sep.join([self.train_dir,str(checkpoint)]))
+                        self.mkdir (os.sep.join([self.out_dir,str(checkpoint)]))
+
+                # if self.logger :
+                        
+                #         We will clear the logs from the data-store 
+                        
+                #         column = self.ATTRIBUTES['synthetic']
+                #         db = self.logger.db
+                #         if db[column].count() > 0 :
+                #                 db.backup.insert({'name':column,'logs':list(db[column].find()) })
+                #                 db[column].drop()
+                
+        def load_meta(self,**args):
                """
                This function is designed to accomodate the uses of the sub-classes outside of a strict dependency model.
                Because prediction and training can happen independently
                """
                # suffix = "-".join(column) if isinstance(column,list)else column
-        suffix = self.get.suffix()
+                suffix = self.CONTEXT #self.get.suffix()
                _name = os.sep.join([self.out_dir,'meta-'+suffix+'.json'])
                if os.path.exists(_name) :
                        attr = json.loads((open(_name)).read())
                        for key in attr :
                                value = attr[key]
+                                if not hasattr(self,key):
                                        setattr(self,key,value)
                self.train_dir  = os.sep.join([self.log_dir,'train',self.CONTEXT])                
                self.out_dir = os.sep.join([self.log_dir,'output',self.CONTEXT])
+                # if 'partition' in args :
+                #         self.train_dir = os.sep.join([self.train_dir,str(args['partition'])])
+                #         self.out_dir = os.sep.join([self.out_dir,str(args['partition'])])
                                
                        
        def log_meta(self,**args) :
+                
                _object = {
+                        # '_id':'meta',
                        'CONTEXT':self.CONTEXT,
                        'ATTRIBUTES':self.ATTRIBUTES,
                        'BATCHSIZE_PER_GPU':self.BATCHSIZE_PER_GPU,
@ -108,6 +183,7 @@ class GNet :
                        "D_STRUCTURE":self.D_STRUCTURE,
                        "G_STRUCTURE":self.G_STRUCTURE,
                        "NUM_GPUS":self.NUM_GPUS,
+                        "GPU_CHIPS":self.GPU_CHIPS,
                        "NUM_LABELS":self.NUM_LABELS,
                        "MAX_EPOCHS":self.MAX_EPOCHS,
                        "ROW_COUNT":self.ROW_COUNT
@ -117,14 +193,32 @@ class GNet :
                        value= args['value']
                        object[key] = value
                # suffix = "-".join(self.column) if isinstance(self.column,list) else self.column
-        suffix = self.get.suffix()
+                suffix = self.CONTEXT #self.get.suffix()
                _name = os.sep.join([self.out_dir,'meta-'+suffix])
                
-        f = open(_name+'.json','w')
-        f.write(json.dumps(_object))
+                # f = open(_name+'.json','w')
+                # f.write(json.dumps(_object))
+                # f.close()
+
+                for _info in [{"name":os.sep.join([self.out_dir,'meta-'+suffix+'.json']),"data":_object},{"name":os.sep.join([self.out_dir,'epochs.json']),"data":self.logs['epochs'] if 'epochs' in self.logs else []}] :
+                        f = open(_info['name'],'w')
+                        f.write(json.dumps(_info['data']))
+                        f.close()
                return _object
        def mkdir (self,path):
                if not os.path.exists(path) :
+                        if os.sep in path :
+                                pass
+                                root = []
+                                
+                                for loc in path.strip().split(os.sep) :
+                                        if loc == '' :
+                                                root.append(os.sep)
+                                        root.append(loc)
+                                        if not os.path.exists(os.sep.join(root)) :                                                
+                                                os.mkdir(os.sep.join(root))
+
+                        elif not os.path.exists(path):
                                os.mkdir(path)            
                

@ -141,15 +235,21 @@ class GNet :
                labels  = None if 'labels' not in args else args['labels']
                n_labels= None if 'n_labels' not in args else args['n_labels']
                shift   = [0] if self.__class__.__name__.lower() == 'generator' else [1] #-- not sure what this is doing
-        mean, var   = tf.nn.moments(inputs, shift, keep_dims=True)
-        shape       = inputs.shape[1].value
-        offset_m    = self.get.variables(shape=[n_labels,shape], name='offset'+name,
-                                    initializer=tf.zeros_initializer)
-        scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name,
-                                initializer=tf.ones_initializer)
+                # mean, var       = tf.nn.moments(inputs, shift, keep_dims=True)
+                mean, var       = tf.nn.moments(inputs, shift,keepdims=True)
+                # shape           = inputs.shape[1].value
+                shape           = inputs.shape[1]
               
+                if labels is not None:
+                        offset_m        = self.get.variables(shape=[1,shape], name='offset'+name,initializer=tf.zeros_initializer)
+                        scale_m = self.get.variables(shape=[n_labels,shape], name='scale'+name,initializer=tf.ones_initializer)
                        offset  = tf.nn.embedding_lookup(offset_m, labels)
                        scale   = tf.nn.embedding_lookup(scale_m, labels)
+
+                else:
+                        offset = None
+                        scale = None
+
                result  = tf.nn.batch_normalization(inputs, mean, var,offset,scale, 1e-8)
                return result

@ -199,9 +299,11 @@ class Generator (GNet):
                #tf.add_to_collection('glosses', loss)
                tf.compat.v1.add_to_collection('glosses', loss)
                return loss, loss                
-    def load_meta(self, column):
-        super().load_meta(column)
-        self.discriminator.load_meta(column)
+        def load_meta(self, **args):
+                # super().load_meta(**args)
+                self.discriminator.load_meta(**args)
+               
+                
        def network(self,**args) :
                """
                This function will build the network that will generate the synthetic candidates
@ -294,11 +396,17 @@ class Train (GNet):
                self.generator = Generator(**args)
                self.discriminator = Discriminator(**args)
                self._REAL = args['real']
-        self._LABEL= args['label']
-        self.column = args['column']
+                self._LABEL= args['label'] if 'label' in args else None
+                # self.column = args['column']
                # print ([" *** ",self.BATCHSIZE_PER_GPU])
                
                self.meta = self.log_meta()
+                if(self.logger):
+                        
+                        self.logger.write({"module":"gan-train","action":"start","input":{"partition":self.PARTITION,"meta":self.meta} } )
+                
+
+                # self.log (real_shape=list(self._REAL.shape),label_shape = self._LABEL.shape,meta_data=self.meta)
        def load_meta(self, column):
                """
                This function will delegate the calls to load meta data to it's dependents
@ -321,6 +429,9 @@ class Train (GNet):
                stage   = args['stage']
                real    = args['real']
                label   = args['label']
+
+                
+                if label is not None :
                        label   = tf.cast(label, tf.int32)
                        #
                        # @TODO: Ziqi needs to explain what's going on here
@ -335,44 +446,62 @@ class Train (GNet):
                if stage == 'D':
                        w, loss = self.discriminator.loss(real=real, fake=fake, label=label)
                        #losses = tf.get_collection('dlosses', scope)
+                        flag = 'dlosses'
                        losses = tf.compat.v1.get_collection('dlosses', scope)
                else:
                        w, loss = self.generator.loss(fake=fake, label=label)
                        #losses = tf.get_collection('glosses', scope)
+                        flag = 'glosses'
                        losses = tf.compat.v1.get_collection('glosses', scope)
+                # losses = tf.compat.v1.get_collection(flag, scope)

                total_loss = tf.add_n(losses, name='total_loss')
-
+                # print (total_loss)
                return total_loss, w
        def input_fn(self):
                """
                This function seems to produce 
                """
                features_placeholder = tf.compat.v1.placeholder(shape=self._REAL.shape, dtype=tf.float32)
-        labels_placeholder = tf.compat.v1.placeholder(shape=self._LABEL.shape, dtype=tf.float32)
+                LABEL_SHAPE = [None,None] if self._LABEL is None else self._LABEL.shape
+                labels_placeholder = tf.compat.v1.placeholder(shape=LABEL_SHAPE, dtype=tf.float32)
+                if self._LABEL is not None :
                        dataset = tf.data.Dataset.from_tensor_slices((features_placeholder, labels_placeholder))
-        dataset = dataset.repeat(10000)
+                else :
+                        dataset = tf.data.Dataset.from_tensor_slices(features_placeholder)
+                # labels_placeholder = None
+                dataset = dataset.repeat(800000)
+                
                dataset = dataset.batch(batch_size=self.BATCHSIZE_PER_GPU)
                dataset = dataset.prefetch(1)
-        iterator = dataset.make_initializable_iterator()
-        # next_element = iterator.get_next()
-        # init_op = iterator.initializer
+                # iterator = dataset.make_initializable_iterator()
+                iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
                return iterator, features_placeholder, labels_placeholder
        
        def network(self,**args):
-    # def graph(stage, opt):
-        # global_step = tf.get_variable(stage+'_step', [], initializer=tf.constant_initializer(0), trainable=False)
                stage   = args['stage']
                opt     = args['opt']
                tower_grads = []
                per_gpu_w   = []
                iterator, features_placeholder, labels_placeholder = self.input_fn()
                with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()):
-            for i in range(self.NUM_GPUS):
+                        #
+                        # @TODO: Find a way to handle this across multiple CPUs in case the GPUs are not available
+                        #       - abstract hardware specification
+                        #       - determine if the GPU/CPU are busy
+                        #
+                        for i in self.GPU_CHIPS : #range(self.NUM_GPUS):
+                                
                                with tf.device('/gpu:%d' % i):
                                        with tf.name_scope('%s_%d' % ('TOWER', i)) as scope:
+                                                if self._LABEL is not None :
                                                        (real, label) = iterator.get_next()
-                        loss, w = self.loss(scope=scope, stage=stage, real=self._REAL, label=self._LABEL)
+                                                else:
+                                                        
+                                                        real = iterator.get_next()
+                                                        label= None
+                                                loss, w = self.loss(scope=scope, stage=stage, real=real, label=label)
+                                                
                                                #tf.get_variable_scope().reuse_variables()
                                                tf.compat.v1.get_variable_scope().reuse_variables()
                                                #vars_ = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=stage)
@ -391,6 +520,9 @@ class Train (GNet):
                # max_epochs = args['max_epochs'] if 'max_epochs' in args else 10
                REAL = self._REAL
                LABEL= self._LABEL       
+                if (self.logger):
+                        pass
+                
                with tf.device('/cpu:0'):
                        opt_d = tf.compat.v1.train.AdamOptimizer(1e-4)
                        opt_g = tf.compat.v1.train.AdamOptimizer(1e-4)
@ -398,17 +530,20 @@ class Train (GNet):
                        train_d, w_distance, iterator_d, features_placeholder_d, labels_placeholder_d = self.network(stage='D', opt=opt_d)
                        train_g, _, iterator_g, features_placeholder_g, labels_placeholder_g = self.network(stage='G', opt=opt_g)
                        # saver = tf.train.Saver()
-            saver   = tf.compat.v1.train.Saver()
+                        saver   = tf.compat.v1.train.Saver(max_to_keep=len(self.CHECKPOINTS))
                        # init    = tf.global_variables_initializer()
                        init    = tf.compat.v1.global_variables_initializer()
                        logs = []
+                        self.logs['epochs'] = []
                        #with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:
                        with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:
+                                
                                sess.run(init)
+                                
                                sess.run(iterator_d.initializer,
-                        feed_dict={features_placeholder_d: REAL, labels_placeholder_d: LABEL})
+                                                        feed_dict={features_placeholder_d: REAL})
                                sess.run(iterator_g.initializer,
-                        feed_dict={features_placeholder_g: REAL, labels_placeholder_g: LABEL})
+                                                        feed_dict={features_placeholder_g: REAL})
                                
                                for epoch in range(1, self.MAX_EPOCHS + 1):
                                        start_time = time.time()
@ -426,21 +561,50 @@ class Train (GNet):
                                        print(format_str % (epoch, -w_sum/(self.STEPS_PER_EPOCH*2), duration))
                                        # print (dir (w_distance))

-                    logs.append({"epoch":epoch,"distance":-w_sum/(self.STEPS_PER_EPOCH*2) })
+                                        # logs.append({"epoch": int(epoch),"distance":float(-w_sum/(self.STEPS_PER_EPOCH*2)) })
                                        
-                    if epoch % self.MAX_EPOCHS == 0:
+                                        suffix = str(self.CONTEXT)
+                                        _name  = os.sep.join([self.train_dir,str(epoch),suffix]) if epoch in self.CHECKPOINTS else ''
+                                        _logentry = {"path":_name,"epochs":int(epoch),"loss":float(-w_sum/(self.STEPS_PER_EPOCH*2))}
+                                        # if epoch % self.MAX_EPOCHS == 0:
+                                        # if epoch in [5,10,20,50,75, self.MAX_EPOCHS] :
+                                        if epoch in self.CHECKPOINTS :
                                                # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
-                        suffix = self.get.suffix()
-                        _name  = os.sep.join([self.train_dir,suffix])
+                                                # suffix = self.CONTEXT #self.get.suffix()
+                                                # _name  = os.sep.join([self.train_dir,str(epoch),suffix])
                                                # saver.save(sess, self.train_dir, write_meta_graph=False, global_step=epoch)
-                        saver.save(sess, _name, write_meta_graph=False, global_step=epoch)
+                                                saver.save(sess, _name, write_meta_graph=False, global_step=np.int64(epoch))
+                                                
                                                #
                                                #
+                                               
+                                                # logs = []
+                                                # if self.logger :
+                                                #         # row = {"module":"gan-train","action":"epochs","input":{"logs":logs}} #,"model":pickle.dump(sess)}                                                        
+                                                #         # self.logger.write(row)
+                                                #         self.logs['epochs'] += logs
+                                                #         #
+                                                #         # @TODO:
+                                                #         # We should upload the files in the checkpoint 
+                                                #         # This would allow the learnt model to be portable to another system
+                                                        #
+                                        self.logs['epochs'].append(_logentry)
+                        tf.compat.v1.reset_default_graph()
+                #
+                # let's sort the epochs we've logged thus far (if any)
+                # Take only the last five checkpoints, see https://stackoverflow.com/questions/41018454/tensorflow-checkpoint-models-getting-deleted
+                #
+                # self.logs['epochs']  = self.logs['epochs'][-5:]
+                
                if self.logger :
-                            row = {"logs":logs} #,"model":pickle.dump(sess)}
-                            
-                            self.logger.write(row=row)
+                        _log = {'module':'gan-train','context':self.CONTEXT,'action':'epochs','input':self.logs['epochs']}
+                        self.logger.write(_log)
                
+                #
+                # @TODO:
+                # Make another copy of this on disk to be able to load it should we not have a logger setup
+                #
+                self.log_meta()
 class Predict(GNet):
        """
        This class uses synthetic data given a learned model
@ -449,135 +613,221 @@ class Predict(GNet):
                GNet.__init__(self,**args)                
                self.generator  = Generator(**args)                
                self.values     = args['values']
-    def load_meta(self, column):
-        super().load_meta(column)
-        self.generator.load_meta(column)
+                self.ROW_COUNT  = args['row_count']
+                self.oROW_COUNT = self.ROW_COUNT
+                
+                # self.MISSING_VALUES = np.nan_to_num(np.nan)
+                # if 'no_value' in args and args['no_value'] not in ['na','','NA'] :
+                #         self.MISSING_VALUES = args['no_value']
+                self.MISSING_VALUES = args['missing'] if 'missing' in args else []
+                
+                        
+                # self.MISSING_VALUES = args['no_value']
+                # self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else  np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value']
+        def load_meta(self, **args):
+                super().load_meta(**args)
+                self.generator.load_meta(**args)
+                self.ROW_COUNT = self.oROW_COUNT
+                #
+                # updating the input/output for the generator, so it points properly
+                #
+                
+                for object in [self,self.generator] :
+                        _train_dir = os.sep.join([self.log_dir,'train',self.CONTEXT,str(self.MAX_EPOCHS)])
+                        _out_dir= os.sep.join([self.log_dir,'output',self.CONTEXT,str(self.MAX_EPOCHS)])
+                        setattr(object,'train_dir',_train_dir)
+                        setattr(object,'out_dir',_out_dir)                
        def apply(self,**args):
-        # print (self.train_dir)
-        # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
-        suffix = self.get.suffix()
+                suffix = self.CONTEXT #self.get.suffix()
                model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)])
+                # model_dir = os.sep.join([self.train_dir,str(self.MAX_EPOCHS)])
+               
                demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo']
+                #
+                # setup computational graph
                tf.compat.v1.reset_default_graph()
-        z = tf.random.normal(shape=[self.BATCHSIZE_PER_GPU, self.Z_DIM])
-        y = tf.compat.v1.placeholder(shape=[self.BATCHSIZE_PER_GPU, self.NUM_LABELS], dtype=tf.int32)
+                z = tf.random.normal(shape=[self.ROW_COUNT, self.Z_DIM])
+
+                y = tf.compat.v1.placeholder(shape=[self.ROW_COUNT, self.NUM_LABELS], dtype=tf.int32)
+                if self._LABEL is not None :
                        ma = [[i] for i in np.arange(self.NUM_LABELS - 2)]
                        label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32)))
+                else:
+                        label = None

                fake    = self.generator.network(inputs=z, label=label)
                init    = tf.compat.v1.global_variables_initializer()
                saver   = tf.compat.v1.train.Saver()
+                df              = pd.DataFrame()
+                CANDIDATE_COUNT = args['candidates'] if 'candidates' in  args else 1 #0 if self.ROW_COUNT < 1000 else 100
+                candidates = []
+        
+                with tf.compat.v1.Session() as sess:
+                        saver.restore(sess, model_dir)
+                        if self._LABEL is not None :
+                                # labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) )
+                                labels= demo
+                        else:
+                                labels = None
+                        
+                        for i in np.arange(CANDIDATE_COUNT) :
+                                if labels :
+                                        _matrix = sess.run(fake,feed_dict={y:labels})
+                                else:
+                                        _matrix = sess.run(fake)
+                                #
+                                # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes
+                                # The code below will ensure we have some acceptable cardinal relationships between id and synthetic values
+                                #
+                                
+                                # df =   pd.DataFrame(np.round(f)).astype(np.int32)
+                                # candidates.append (np.round(_matrix).astype(np.int64)) 
+                                candidates.append(np.array([np.round(row).astype(int) for row in _matrix]))                       
+                # return candidates[0] if len(candidates) == 1 else candidates
+                
+                return [candidates [0]]
+
+        def _apply(self,**args):
+                # print (self.train_dir)
+                # suffix = "-".join(self.ATTRIBUTES['synthetic']) if isinstance(self.ATTRIBUTES['synthetic'],list) else self.ATTRIBUTES['synthetic']
+                suffix = self.CONTEXT #self.get.suffix()
+                model_dir = os.sep.join([self.train_dir,suffix+'-'+str(self.MAX_EPOCHS)])
+                demo = self._LABEL #np.zeros([self.ROW_COUNT,self.NUM_LABELS]) #args['de"shape":{"LABEL":list(self._LABEL.shape)} mo']
+                tf.compat.v1.reset_default_graph()
+                z = tf.random.normal(shape=[self.ROW_COUNT, self.Z_DIM])
+
+                y = tf.compat.v1.placeholder(shape=[self.ROW_COUNT, self.NUM_LABELS], dtype=tf.int32)
+                if self._LABEL is not None :
+                        ma = [[i] for i in np.arange(self.NUM_LABELS - 2)]
+                        label = y[:, 1] * len(ma) + tf.squeeze(tf.matmul(y[:, 2:], tf.constant(ma, dtype=tf.int32)))
+                else:
+                        label = None
+
+                fake    = self.generator.network(inputs=z, label=label)
+                init    = tf.compat.v1.global_variables_initializer()
+                saver   = tf.compat.v1.train.Saver()
+                df              = pd.DataFrame()
+                CANDIDATE_COUNT = 5 #0 if self.ROW_COUNT < 1000 else 100
+                NTH_VALID_CANDIDATE = count = np.random.choice(np.arange(2,60),2)[0]
                with tf.compat.v1.Session() as sess:
                        
                        # sess.run(init)
-            saver.restore(sess, model_dir)
-            labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) )

+                        saver.restore(sess, model_dir)
+                        if self._LABEL is not None :
+                                labels = np.zeros((self.ROW_COUNT,self.NUM_LABELS) )
                                labels= demo
-            f = sess.run(fake,feed_dict={y:labels})
+                        else:
+                                labels = None
+                        
+                        found = []
+                        ratio = []
+                        __x__ = None
+                        __ratio=0
+                        for i in np.arange(CANDIDATE_COUNT) :
+                                if labels :
+                                        _matrix = sess.run(fake,feed_dict={y:labels})
+                                else:
+                                        _matrix = sess.run(fake)
                                #
                                # if we are dealing with numeric values only we can perform a simple marginal sum against the indexes
+                                # The code below will ensure we have some acceptable cardinal relationships between id and synthetic values
                                #
                                
-            df =  ( pd.DataFrame(np.round(f).astype(np.int32)))
+                                # df =   pd.DataFrame(np.round(f)).astype(np.int32)
+                                found.append (np.round(_matrix).astype(np.int64))
+                                # df =   pd.DataFrame(np.round(_matrix),dtype=int)
+                                p = 0 not in df.sum(axis=1).values
+                                # x = df.sum(axis=1).values
+                                
+                                # if np.divide( np.sum(x), x.size)  > .9 or p and np.sum(x) == x.size :
+                                #         ratio.append(np.divide( np.sum(x), x.size))
+                                #         found.append(df)
+                                        
+                                #         # break
+                                #         if len(found) == CANDIDATE_COUNT:
+                                                
+                                #                 break
+                                # else:
+                                #         __x__   = df if  __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __x__
+                                #         __ratio = np.divide( np.sum(x), x.size) if __x__ is None or np.where(x > 0)[0].size > np.where(__x__ > 0)[0].size else __ratio
+                                #         continue
+                                       
                        # i = df.T.index.astype(np.int32) #-- These are numeric pseudonyms
                        # df = (i * df).sum(axis=1)
                        #
                        # In case we are dealing with actual values like diagnosis codes we can perform 
                        #
-            columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']]
+                        # N = len(found)
+                        # _index = [i for i in range(0,N) if found[i].shape[1] == len(self.values)]
+                        # if not _index and not found :
+                        #         df = __x__
+                        #         INDEX = -1
+                        # else :
+                        #         if not _index :
+                        #                 INDEX = np.random.choice(np.arange(len(found)),1)[0]
+                        #                 INDEX = ratio.index(np.max(ratio))
+                        #         else:
+                        #                 INDEX = _index[0]

-            r = np.zeros((self.ROW_COUNT,len(columns)))
-            for col in df :
-                i = np.where(df[col])[0]
-                r[i] = col
                        
-            df = pd.DataFrame(r,columns=columns)
+                        #         df = found[INDEX]
+                        # columns = self.ATTRIBUTES['synthetic'] if isinstance(self.ATTRIBUTES['synthetic'],list)else [self.ATTRIBUTES['synthetic']]
                        
-            df[df.columns] = (df.apply(lambda value: self.values[ int(value)],axis=1))
-            return df.to_dict(orient='lists')
+                        # r = np.zeros((self.ROW_COUNT,len(columns)))
+                        # r = np.zeros(self.ROW_COUNT)
+                        
+                        # if self.logger :
+                        #         info = {"found":len(found),"rows":df.shape[0],"cols":df.shape[1],"expected":len(self.values)}
+                        #         if df.shape[1] > len(self.values) :
+                        #                 df = df.iloc[:len(self.values)]
+                        #         if INDEX > 0 :
+                        #                 info =dict(info ,**{"selected":INDEX, "ratio": ratio[INDEX] })
+                        #         else :
+                                        
+                        #                 info['selected'] = -1
+                        #                 info['ratio'] = __ratio
+                        #         info['partition'] = self.PARTITION
+                        #         self.logger.write({"module":"gan-generate","action":"generate","input":info})
+                        # # df.columns = self.values
+                        # if len(found) or df.columns.size <= len(self.values):
+                        #         ii = df.apply(lambda row: np.sum(row) == 0 ,axis=1)
+                        #         missing = []
+                        #         if ii.sum() > 0 :
+                        #                 #
+                        #                 # If the generator had a reductive effect we should be able to get random values from either :
+                        #                 #       - The space of outliers
+                        #                 #       - existing values for smaller spaces that have suffered over training
+                        #                 #
+
+                        #                 N = ii.sum() 
+                        #                 missing_values = self.MISSING_VALUES if self.MISSING_VALUES else self.values
+                        #                 missing = np.random.choice(missing_values,N)
+                        #                 # missing = []
+                        #         #
+                        #         # @TODO:
+                        #         #       Log the findings here in terms of ratio, missing, candidate count
+                        #         # print ([np.max(ratio),len(missing),len(found),i])
+                        #         i = np.where(ii == 0)[0]
+                                
+                                
+                        #         df =  pd.DataFrame( df.iloc[i].apply(lambda row: self.values[np.random.choice(np.where(row != 0)[0],1)[0]] ,axis=1))
+                        #         df.columns = columns
+                        #         df = df[columns[0]].append(pd.Series(missing))
+                                
+                                
+                        #         if self.logger :
+                                        
+                        #                 info= {"missing": i.size,"rows":df.shape[0],"cols":1,'partition':self.PARTITION}
+                        #                 self.logger.write({"module":"gan-generate","action":"compile.io","input":info})
+
+                           
+                        
+                # print(df.head())
+                tf.compat.v1.reset_default_graph()
+                # df = pd.DataFrame(df)
+                # df.columns = columns
+                # np.random.shuffle(df[columns[0]].values)
                # return df.to_dict(orient='list')
-            # count = str(len(os.listdir(self.out_dir)))
-            # _name = os.sep.join([self.out_dir,self.CONTEXT+'-'+count+'.csv'])
-            # df.to_csv(_name,index=False)
-
-            
-            # output.extend(np.round(f))
-
-            # for m in range(2):
-            #     for n in range(2, self.NUM_LABELS):
-            #         idx1 = (demo[:, m] == 1)
-            #         idx2 = (demo[:, n] == 1)
-            #         idx = [idx1[j] and idx2[j] for j in range(len(idx1))]
-            #         num = np.sum(idx)
-            #         print ("___________________list__")
-            #         print (idx1)
-            #         print (idx2)
-            #         print (idx)
-            #         print (num)
-            #         print ("_____________________")
-            #         nbatch = int(np.ceil(num / self.BATCHSIZE_PER_GPU))
-            #         label_input = np.zeros((nbatch*self.BATCHSIZE_PER_GPU, self.NUM_LABELS))
-            #         label_input[:, n] = 1
-            #         label_input[:, m] = 1
-            #         output = []
-            #         for i in range(nbatch):
-            #             f = sess.run(fake,feed_dict={y: label_input[i* self.BATCHSIZE_PER_GPU:(i+1)* self.BATCHSIZE_PER_GPU]})
-            #             output.extend(np.round(f))
-            #         output = np.array(output)[:num]
-                    # print ([m,n,output])
-                    
-                    # np.save(self.out_dir + str(m) + str(n), output)
-    
-
-if __name__ == '__main__' :
-    #
-    # Now we get things done ...
-    column      = SYS_ARGS['column']
-    column_id   = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id'
-    column_id   = column_id.split(',') if ',' in column_id else column_id
-    df = pd.read_csv(SYS_ARGS['raw-data'])  
-    LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values
-    
-    context     = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4]
-    if set(['train','learn']) & set(SYS_ARGS.keys()):
-        
-        df = pd.read_csv(SYS_ARGS['raw-data'])   
-        
-        # cols = SYS_ARGS['column']
-        # _map,_df = (Binary()).Export(df)
-        # i = np.arange(_map[column]['start'],_map[column]['end'])
-        max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10
-        # REAL    = _df[:,i]
-        REAL    = pd.get_dummies(df[column]).astype(np.float32).values
-        LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values
-        trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id)
-        trainer.apply()
-        
-        
-       
-        
-        #
-        # We should train upon this data
-        #
-        # -- we need to convert the data-frame to binary matrix, given a column
-        #
-        pass
-    elif 'generate' in SYS_ARGS:
-        values = df[column].unique().tolist()
-        values.sort()
-        
-        p = Predict(context=context,label=LABEL,values=values,column=column)
-        p.load_meta(column)
-        r = p.apply()
-        print (df)
-        print ()
-        df[column] = r[column]
-        print (df)
-        
-        
-    else:
-        print (SYS_ARGS.keys())
-        print (__doc__)
-    pass
+                return _matrix

--- a/data/maker/init.py
+++ b/data/maker/init.py
@ -11,68 +11,592 @@ This package is designed to generate synthetic data from a dataset from an origi
 import pandas as pd
 import numpy as np
 import data.gan as gan
-from transport import factory
-def train (**args) :
-    """
-    This function is intended to train the GAN in order to learn about the distribution of the features
-    :column     columns that need to be synthesized (discrete)
-    :logs       where the output of the (location on disk)
-    :id         identifier of the dataset
-    :data       data-frame to be synthesized
-    :context    label of what we are synthesizing
-    """
-    column      = args['column']
+import transport
+# from data.bridge import Binary
+import threading
+from data.maker import prepare
+from data.maker.state import State
+import copy
+import os
+import nujson as json
+from multiprocessing import Process, RLock
+from datetime import datetime, timedelta
+from multiprocessing import Queue

-    column_id   = args['id']
-    df          = args['data']
-    logs        = args['logs']
-    real        = pd.get_dummies(df[column]).astype(np.float32).values
-    labels      = pd.get_dummies(df[column_id]).astype(np.float32).values
-    num_gpu     = 1 if 'num_gpu' not in args else args['num_gpu']
-    max_epochs  = 10 if 'max_epochs' not in args else args['max_epochs']
-    context     = args['context']
-    if 'store' in args :
-        args['store']['args']['doc'] = context
-        logger = factory.instance(**args['store'])
+import time
+
+
+
+class Learner(Process):
+
+    def __init__(self,**_args):
+        """
+        Set up the state shared by the learners
+        :param info         job specification, 'context' is required (unless network_args is provided), 'columns' is optional
+        :param store        data-store configuration, the optional 'logger' entry is used to create the logger
+        :param gpu          (optional) index of the gpu, it is also exported as CUDA_VISIBLE_DEVICES
+        :param network_args (optional) arguments of the network, when absent they are built from
+                            logs (default 'logs'), epochs (default 2) and batch (default 2000)
+        :param data         (optional) data-frame to work with
+        """
+        super(Learner, self).__init__()
+        self.ndx = 0
+        self._queue = Queue()
+        self.lock = RLock()
+        #
+        # The index of the gpu is kept as an integer, None means the caller did not ask for one
+        self.gpu = None
+        if 'gpu' in _args :
+            os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu'])
+            self.gpu = int(_args['gpu'])
+
+        self.info = _args['info']
+        self.columns = self.info['columns'] if 'columns' in self.info else None
+        self.store = _args['store']
+
+        if 'network_args' in _args :
+            self.network_args = _args['network_args']
+        else:
+            self.network_args = {
+                'context':self.info['context'],
+                'logs':_args.get('logs','logs'),
+                'max_epochs':int(_args.get('epochs',2)),
+                'batch_size':int(_args.get('batch',2000))
+            }
+        self._encoder = None
+        self._map = None
+        self._df = _args.get('data')
+        self.name = self.__class__.__name__
+        #
+        # @TODO: allow for verbose mode so we have a sense of what is going on within the network
+        #
+        self.log(action='init',gpu=-1 if self.gpu is None else self.gpu)
+        self.cache = []
+        self.logger = None
+        if 'logger' in self.store :
+            self.logger = transport.factory.instance(**self.store['logger'])
+        self.autopilot = False  #-- to be set by caller
+        self._initStateSpace()
+    def _initStateSpace(self):
+        """
+        Load the state-space of the data-maker i.e the pre/post processing functions applied to the data :
+            - Trainer    -> pre-processing
+            - Generation -> post processing
+        The state-space is read from info['state'] and is specified in the configuration file as such
+        state:{pre:{path,pipeline:[]}, post:{path,pipeline:[]}}
+        A failure to load is printed and leaves the state-space empty (None)
+        """
+        self._states = None
+        if 'state' not in self.info :
+            return
+        try:
+            self._states = State.instance(self.info['state'])
+        except Exception as e:
+            print (e)
+        if not self._states :
+            return
+        #
+        # Only the descriptive part (name, args, path) of each function is logged
+        _summary = {}
+        for _phase in self._states :
+            _summary[_phase] = [{"name":_item['name'],"args":_item['args'],"path":_item['path']} for _item in self._states[_phase]]
+        self.log(object='state-space',action='load',input=_summary)
+        
+
+        
+    def log(self,**_args):
+        """
+        Write a log entry to the logger configured in store['logger'] (best effort)
+        The entry holds ndx, module, table, context and info in addition to the attributes provided by the caller
+        Nothing is written when no logger is configured; errors are deliberately muted so that logging
+        never interrupts training or generation (the entry counter is not incremented in that case)
+        """
+        try:
+            _context = self.info['context']
+            _label = self.info['info'] if 'info' in self.info else _context
+            _entry = {'ndx':self.ndx,'module':self.name,'table':self.info['from'],'context':_context,'info':_label,**_args}
+            if 'logger' in self.store :
+                #
+                # NOTE: a writer is created on every call
+                logger = transport.factory.instance(**self.store['logger'])
+                logger.write(_entry)
+            self.ndx += 1
+        except Exception:
+            #
+            # muted on purpose, logging is best effort
+            pass
+    def get_schema(self):
+        """
+        :return the meta-data of the table info['from'] as reported by the reader of the source data-store
+        """
+        _reader = transport.factory.instance(**self.store['source'])
+        _table = self.info['from']
+        return _reader.meta(table=_table)
+
+    def initalize(self):
+        """
+        Prepare the data for the network :
+            1. read the data from the source, unless a data-frame was provided to the constructor
+            2. apply the pre-processing functions of the state-space if applicable (see the note below)
+            3. fill the missing values of the columns of interest
+            4. build the encoder (prepare.Input), it is left to None if the data-frame has no rows
+        """
+        reader  = transport.factory.instance(**self.store['source'])
+        if self._df is None :
+            self._df     = reader.read(**self.info)
+        #
+        # NOTE : PRE
+        # At this point we apply pre-processing of the data if there were ever a need for it
+        # The pre-processing functions are called if there are any and either of the conditions is true
+        #   1. we are not generating (Trainer, Shuffle)
+        #   2. we are generating and not on autopilot
+        #
+        HAS_STATES = self._states is not None and 'pre' in self._states
+        NOT_GENERATING = self.name in ['Trainer','Shuffle']
+        IS_AUTOPILOT = self.autopilot
+        _ALLOW_PRE_CALL = HAS_STATES and (NOT_GENERATING or IS_AUTOPILOT is False)
+        if _ALLOW_PRE_CALL :
+            #
+            # Only the descriptive part of the functions is logged (same as when the state-space is loaded)
+            _pre = [{"name":_item['name'],"args":_item['args'],"path":_item['path']} for _item in self._states['pre']]
+            _beg = list(self._df.shape)
+            self._df = State.apply(self._df,self._states['pre'])
+            _end = list(self._df.shape)
+            self.log(action='status',input={'pre':_pre,'size':(_beg,_end)})
+
+        columns = self.columns if self.columns else self._df.columns
+        #
+        # Below is a source of inefficiency, unfortunately python's type inference doesn't work well in certain cases
+        # - The code below tries to address the issue (Perhaps better suited for the reading components)
+        #
+        _numeric = [int,float,np.int64,np.int32,np.float32,np.float64]
+        _structure = {}
+        for name in columns :
+            #
+            # randomly sampling 5 elements to make sense of data-types, smaller columns are left as they are
+            if self._df[name].size < 5 :
+                continue
+            _index = np.random.choice(np.arange(self._df[name].size),5,False)
+            _found = [type(value) in _numeric for value in self._df[name].values[_index]]
+            #
+            # missing values become 0 if the sample has at least a number, an empty string otherwise
+            no_value = 0 if np.sum(_found) > 0 else ''
+            self._df[name] = self._df[name].fillna(no_value)
+            _structure[name] = self._df[name].dtypes.name
+        self.log(action='structure',input=_structure)
+        #
+        # convert the data to binary here ...
+        _schema = self.get_schema()
+        _args = {"schema":_schema,"data":self._df,"columns":columns}
+        if self._map :
+            _args['map'] = self._map
+        self._encoder = prepare.Input(**_args)  if self._df.shape[0] > 0 else None
+        self.log(action='data-prep',input={'rows':int(self._df.shape[0]),'cols':int(self._df.shape[1])})
+    def get(self):
+        """
+        :return the cache if it holds anything, otherwise the next item of the queue if there is one, otherwise an empty list
+        """
+        if self.cache :
+            return self.cache
+        _pending = self._queue.qsize()
+        if _pending > 0 :
+            return self._queue.get()
+        return []
+         
+    def listen(self):
+        """
+        Move every item received on the queue to the cache, this function never returns
+        """
+        #
+        # task_done only exists on joinable queues, a plain multiprocessing Queue doesn't have it
+        _done = getattr(self._queue,'task_done',None)
+        while True :
+            _info = self._queue.get()
+            self.cache.append(_info)
+            if callable(_done) :
+                _done()
+    def publish(self,caller):
+        """
+        Put the cache (as a whole) on the queue of the caller, nothing is done if the caller has no queue
+        :param caller   object that may have a _queue attribute
+        """
+        if not hasattr(caller,'_queue') :
+            return
+        caller._queue.put(self.cache)
+class Trainer(Learner):
+    """
+    This will perform training using a GAN
+    """
+    def __init__(self,**_args):
+        """
+        :param limit            (optional) kept as an integer, None by default
+        :param autopilot        (optional, default False) run the generator once the training is done
+        :param candidates       (optional, default 1) number of candidates handed to the network and the generator
+        :param checkpoint_skips (optional) handed as is to the network
+        The other parameters are those of Learner
+        """
+        super().__init__(**_args)
+        self.limit  = int(_args['limit']) if 'limit' in _args else None
+        self.autopilot = _args['autopilot'] if 'autopilot' in _args else False
+        #
+        # NOTE: no attribute named generate must be set on the instance, it would hide the generate method of the class
+        self.candidates = int(_args['candidates']) if 'candidates' in _args else 1
+        self.checkpoint_skips = _args['checkpoint_skips'] if 'checkpoint_skips' in _args else None
+    def run(self):
+        """
+        Train the network on the binary matrix of the data, write the map of the encoder next to the output of the network (map.json)
+        and prepare a generator for the stored checkpoint with the smallest loss; the generator is run if autopilot is set
+        """
+        self.initalize()
+        if self._encoder is None :
+            #
+            # @TODO Log that the dataset was empty or not statistically relevant
+            return
+        _space,_matrix = self._encoder.convert()
+        #
+        # NOTE: _args is the very dictionary held in network_args, whatever is added here is also handed to the generator
+        _args   = self.network_args
+        if self.gpu is not None :
+            #
+            # the index 0 is a valid gpu, hence the comparison against None
+            _args['gpu'] = self.gpu
+        _args['real'] = _matrix
+        _args['candidates'] = self.candidates
+        if 'logger' in self.store :
+            _args['logger'] = transport.factory.instance(**self.store['logger'])
+        if self.checkpoint_skips :
+            _args['checkpoint_skips'] = self.checkpoint_skips
+        #
+        # At this point we have the binary matrix, we can initiate training
+        #
+        beg = datetime.now()
+        gTrain = gan.Train(**_args)
+        gTrain.apply()
+
+        writer = transport.factory.instance(provider=transport.providers.FILE,context='write',path=os.sep.join([gTrain.out_dir,'map.json']))
+        writer.write(self._encoder._map,overwrite=True)
+        writer.close()
+
+        #
+        # Arguments of the generator
+        #
+        _args = {"network_args":self.network_args,"store":self.store,"info":self.info,"candidates":self.candidates,"data":self._df}
+        _args['logs'] = self.network_args['logs']
+        _args['autopilot'] = self.autopilot
+        if self.gpu is not None :
+            _args['gpu'] = self.gpu
+
+        #
+        # Let us find the checkpoint with the smallest loss among those stored on disk
+        # If nothing was stored the number of epochs is left as configured
+        #
+        _epochs = [_e  for _e in gTrain.logs['epochs'] if _e['path'] != '']
+        if _epochs :
+            _best = min(_epochs,key=lambda _item: _item['loss'])
+            _args['network_args']['max_epochs'] = _best['epochs']
+            self.log(action='autopilot',input={'epoch':_best})
+        else:
+            self.log(action='autopilot',input={'epoch':None})
+        g = Generator(**_args)
+
+        end = datetime.now()
+        #
+        # total_seconds accounts for the days, seconds alone would wrap around after 24 hours
+        _min = (end-beg).total_seconds()/ 60
+        _logs = {'action':'train','input':{'start':beg.strftime('%Y-%m-%d %H:%M:%S'),'minutes':_min,"unique_counts":self._encoder._io[0]}}
+        self.log(**_logs)
+        self._g = g
+        if self.autopilot :
+            self._g.run()
+            #
+            #@TODO Find a way to have the data in the object ....
+
+   
+   
+    def generate (self):
+        """
+        Guard against calling the generation by hand :
+            - if autopilot is set a notice is printed
+            - otherwise an exception is raised
+        """
+        if not self.autopilot :
+            raise Exception( "Autopilot has not been, Wait till training is finished. Use is_alive function on process object")
+        print( "Autopilot is set ... No need to call this function")
+
+class Generator (Learner):
+    def __init__(self,**_args):
+        """
+        :param candidates   (optional, default 1) number of candidates, stored in the arguments of the network
+        :param autopilot    (optional, default False)
+        The other parameters are those of Learner
+        The map of the space is read from <logs>/output/<context>/map.json, it is empty if the file doesn't exist
+        """
+        super().__init__(**_args)
+        #
+        # We need to load the mapping information for the space we are working with ...
+        #
+        self.network_args['candidates'] = int(_args['candidates']) if 'candidates' in _args else 1
+        filename = os.sep.join([self.network_args['logs'],'output',self.network_args['context'],'map.json'])
+        _exists = os.path.exists(filename)
+        self.log(**{'action':'init-map','input':{'filename':filename,'exists':_exists}})
+        self._map = {}
+        if _exists :
+            #
+            # the file is closed even if reading or parsing fails
+            with open(filename) as _file :
+                self._map = json.loads(_file.read())
+        self.autopilot = False if 'autopilot' not in _args else _args['autopilot']
+    def run(self):
+        """
+        Generate the candidates with the network (gan.Predict), revert them with the encoder and hand them to post
+        Nothing is done if there is no encoder i.e the data-frame has no rows
+        """
+        self.initalize()
+        if self._encoder is None :
+            #
+            # @TODO Log that the dataset was empty or not statistically relevant
+            return
+
+        #
+        # The values will be returned because we have provided _map information from the constructor
+        #
+        values,_matrix = self._encoder.convert()
+        _args = self.network_args
+        _args['map'] = self._map
+        _args['values'] = np.array(values)
+        _args['row_count'] = self._df.shape[0]
+        if self.gpu is not None :
+            #
+            # the index 0 is a valid gpu, hence the comparison against None
+            _args['gpu'] = self.gpu
+        if 'logger' in self.store :
+            _args['logger'] = transport.factory.instance(**self.store['logger'])
+        gHandler = gan.Predict(**_args)
+        gHandler.load_meta(columns=None)
+        _iomatrix = gHandler.apply()
+        _candidates= [ self._encoder.revert(matrix=_item) for _item in _iomatrix]
+
+        _size = np.sum([len(_item) for _item in _iomatrix])
+        _log = {'action':'io-data','input':{'candidates':len(_candidates),'rows':int(_size)}}
+        self.log(**_log)
+        self.post(_candidates)
+    def approximate(self,_df):
+        """
+        Add random noise to the columns listed in info['approximate']
+        A column is processed in 10 batches if it has more than 100 values, in a single batch otherwise
+        Within a batch the values other than '', None and np.nan receive a random number in [0,1) increased by a quarter of
+        the standard deviation of the batch; the noise is either added or subtracted for the batch as a whole
+        :param _df  data-frame to be approximated
+        :return the data-frame
+        """
+        _columns = self.info['approximate']
+ 
+        for name in _columns :
+            if _df[name].size > 100 :
+                BATCH_SIZE = 10
                
            else:
-        logger = None
+                BATCH_SIZE = 1
+            batches = np.array_split(_df[name].fillna(np.nan).values,BATCH_SIZE)
+            # the target type is int64 if the configured value of the column contains 'int', float64 otherwise
+            _type = np.int64 if 'int' in self.info['approximate'][name]else np.float64
+            x = []
+            _log = {'action':'approximate','input':{'batch':BATCH_SIZE,'col':name}}
+            for values in batches :
              
-    trainer     = gan.Train(context=context,max_epochs=max_epochs,num_gpu=num_gpu,real=real,label=labels,column=column,column_id=column_id,logger = logger,logs=logs)
-    return trainer.apply()
-
-def generate(**args):
-    """
-    This function will generate a synthetic dataset on the basis of a model that has been learnt for the dataset
-    @return pandas.DataFrame
-
-    :data   data-frame to be synthesized
-    :column   columns that need to be synthesized (discrete)
-    :id     column identifying an entity
-    :logs   location on disk where the learnt knowledge of the dataset is
-    """
-    df      = args['data']
-    
-    column      = args['column'] 
-    column_id   = args['id']
-    logs        = args['logs']
-    context = args['context']
-    num_gpu     = 1 if 'num_gpu' not in args else args['num_gpu']
-    max_epochs  = 10 if 'max_epochs' not in args else args['max_epochs']
+                index = [ _x not in ['',None,np.nan] for _x in values]
                
+                if np.sum(index) == 0:
                    #
-    #@TODO:
-    #   If the identifier is not present, we should fine a way to determine or make one
-    #
-    #ocolumns= list(set(df.columns.tolist())- set(columns))
+                    # Sometimes messy data has unpleasant surprises
+                    # NOTE(review): a skipped batch contributes nothing to x, which then seems to be shorter than the column -- to be confirmed
+                    continue
                
-    values = df[column].unique().tolist()
-    values.sort()
+                _values = np.random.rand( len(values[index]))   
+                _values += np.std(values[index]) / 4
                
-    labels = pd.get_dummies(df[column_id]).astype(np.float32).values
-    handler = gan.Predict (context=context,label=labels,max_epochs=max_epochs,num_gpu=num_gpu,values=values,column=column,logs=logs)
-    handler.load_meta(column)
-    r =  handler.apply()
-    _df = df.copy()
-    _df[column] = r[column]
+                values[index] = list(values[index] + _values )if np.random.randint(0,2) else list(values[index] - _values)
+                values[index] = values[index].astype(_type)
+                x += values.tolist()
+            
+            if x :   
+                # percentage of the values (without the missing ones) that remained identical after the approximation
+                _log['input']['identical_percentage'] = 100 * (np.divide( (_df[name].dropna() == x).sum(),_df[name].dropna().size))
+               
+                _df[name] = x  #np.array(x,dtype=np.int64) if 'int' in _type  else np.arry(x,dtype=np.float64)
+                self.log(**_log)
        return _df
+    def make_date(self,**_args) :
+        """
+        Generate a random date within a given year, optionally followed by dates shifted by a number of days
+        :param year     year of the date; '', None and NaN yield None
+        :param offset   (optional) list of days, each one is added to the previously generated date
+        :param field    (optional) name of the field, used to look up the format of the date in info['format']
+        :return a formatted date (the default format is %Y-%m-%d) or a list of formatted dates if an offset is provided
+        """
+        import calendar
+        year = _args['year']
+        if year in ['',None] or (isinstance(year,float) and np.isnan(year)) :
+            #
+            # NaN is tested explicitly, a NaN read from a data-frame is not the np.nan object
+            return None
+        year = int(year)
+
+        offset = _args['offset'] if 'offset' in _args else 0
+        month   = np.random.randint(1,13)
+        #
+        # monthrange provides the number of days of the month (leap years included)
+        # the upper bound of randint is exclusive, hence the + 1 to be able to draw the last day of the month
+        _end = calendar.monthrange(year,month)[1]
+        day = np.random.randint(1,_end + 1)
+
+        #-- synthetic date
+        _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0)
+        FORMAT =  '%Y-%m-%d'
+        _name = _args['field'] if 'field' in _args else None
+        if 'format' in self.info and _name in self.info['format']:
+            FORMAT = self.info['format'][_name]
+
+        if not offset :
+            return _date.strftime(FORMAT)
+        #
+        # The offsets are cumulative i.e every date is computed from the previous one
+        r = [_date.strftime(FORMAT)]
+        for _delta in offset :
+            _date = _date + timedelta(_delta)
+            r.append(_date.strftime(FORMAT))
+        return r
+    def format(self,_df,_schema):
+        """
+        Align the types of the data-frame with the schema :
+            - date, datetime and timestamp fields are only checked against their format, the column is left as it is
+              and the format is logged unless the check failed
+            - integer and float fields have their missing values (null, '', ' ', 'NA', 'nan') set to 0 and are cast
+        :param _df      data-frame to be formatted, the numeric columns are modified in place
+        :param _schema  list of {name,type} describing the fields
+        :return the data-frame
+        """
+        r = {}
+
+        for _item in _schema :
+            name = _item['name']
+            _kind = _item['type'].upper()
+            if _kind in ['DATE','DATETIME','TIMESTAMP'] :
+                FORMAT = '%Y-%m-%d'
+                try:
+                    #
+                    #-- Sometimes data isn't all it's meant to be
+                    _check = False
+                    if 'format' in self.info and name in self.info['format'] :
+                        FORMAT = self.info['format'][name]
+                        _check = True
+                    elif _kind in ['DATETIME','TIMESTAMP'] :
+                        FORMAT = '%Y-%m-%d %H:%M:%S'
+                        _check = True
+
+                    if _check :
+                        pd.to_datetime(_df[name], format=FORMAT)
+                    r[name] = FORMAT
+                except Exception:
+                    #
+                    # the values don't comply with the format, the field is left out of the log
+                    pass
+            else:
+                #
+                # Because types are inferred on the basis of the sample being processed they can sometimes be wrong
+                #   To help disambiguate we add the schema information
+                #   The builtin types are used, numpy no longer provides the int and float aliases
+                _type = None
+                if 'int' in _df[name].dtypes.name or 'int' in _item['type'].lower():
+                    _type = int
+                elif 'float' in _df[name].dtypes.name or 'float' in _item['type'].lower():
+                    _type = float
+                if _type is not None :
+                    _df[name] = _df[name].fillna(0).replace(' ',0).replace('',0).replace('NA',0).replace('nan',0).astype(_type)
+
+        if r :
+            self.log(**{'action':'format','input':r})
+        return _df
+    def post(self,_candidates):
+        """
+        Consolidate every candidate with the original data and either write it to the target store or keep it in the cache
+            1. the synthetic columns of the candidate replace those of a copy of the original data-frame
+            2. the post-processing functions of the state-space are applied if there are any
+            3. the types are aligned with the schema of the source table
+        :param _candidates  list of data-frames with the synthetic columns
+        """
+        _store = None
+        if 'target'  in self.store :
+            #
+            # NOTE: the configuration of the target is updated in place
+            _store = self.store['target']
+            _store['lock'] = True
+            _store['context'] = 'write' #-- Just in case
+            if 'table' not in _store :
+                _store['table'] = self.info['from']
+        _schema = None
+        N = 0
+        for _iodf in _candidates :
+            _df = self._df.copy()
+            _df[self.columns] = _iodf[self.columns]
+            N += _df.shape[0]
+            if self._states and 'post' in self._states:
+                _df = State.apply(_df,self._states['post'])
+            #
+            #@TODO:
+            # Improve formatting with better post-processing pipeline (approximation, dates)
+            #
+            if _schema is None :
+                #
+                # The schema is the same for every candidate, it is read once
+                _schema = self.get_schema()
+
+            _df = self.format(_df,_schema)
+            #
+            # The types are matched by name, the order of the columns may differ from the order of the schema
+            _log = [{"name":_field['name'],"dataframe":(_df[_field['name']].dtypes.name if _field['name'] in _df.columns else None),"schema":_field['type']} for _field in _schema]
+            self.log(**{"action":"consolidate","input":_log})
+
+            if _store :
+                writer = transport.factory.instance(**_store)
+                if _store['provider'] == 'bigquery':
+                    writer.write(_df,schema=[],table=self.info['from'])
+                else:
+                    writer.write(_df,table=self.info['from'])
+            else:
+                self.cache.append(_df)
+
+        self.log(**{'action':'write','input':{'rows':N,'candidates':len(_candidates)}})
+class Shuffle(Generator):
+    """
+    This is a method that will yield data with low utility : the columns of interest are kept as they are
+    and the rows of the other columns are shuffled, no network is involved
+    """
+    def __init__(self,**_args):
+        super().__init__(**_args)
+    def run(self):
+        """
+        Shuffle the data and hand the result to post as a single candidate; a failure of post is logged, not raised
+        NOTE(review): the shuffle selects rows by label with positions 0..n-1, this presumably expects the data-frame to have a default index -- to be confirmed
+        """
+        #
+        # The seed is fixed, so is the sequence of random numbers used for the shuffle
+        np.random.seed(1)
+        self.initalize()
+        _index = np.arange(self._df.shape[0])
+        np.random.shuffle(_index)
+        np.random.shuffle(_index)
+        _iocolumns = self.info['columns']
+        _ocolumns = list(set(self._df.columns) - set(_iocolumns) )
+        _iodf = pd.DataFrame(self._df[_iocolumns].copy(),index = np.arange(_index.size))
+        #
+        # The rows of the other columns are shuffled and joined with the columns of interest
+        self._df = self._df.loc[_index][_ocolumns]
+        self._df.index = np.arange(self._df.shape[0])
+        self._df = self._df.join(_iodf)
+        #
+        # The following is a full shuffle
+        self._df = self._df.loc[_index]
+        self._df.index = np.arange(self._df.shape[0])
+
+        _log = {'action':'io-data','input':{'candidates':1,'rows':int(self._df.shape[0])}}
+        self.log(**_log)
+        try:
+            self.post([self._df])
+            self.log(**{'action':'completed','input':{'candidates':1,'rows':int(self._df.shape[0])}})
+        except Exception as e :
+            #
+            # The message is logged as text, an exception object can't be serialized by the loggers
+            self.log(**{'action':'failed','input':{'msg':str(e),'info':self.info}})
+class apply :
+    """
+    Names of the operations understood by factory.instance
+    """
+    TRAIN = 'train'
+    GENERATE = 'generate'
+    RANDOM = 'random'
+class factory :
+    """
+    Entry point to create the process that trains, generates or shuffles
+    """
+    _infocache = {}
+    @staticmethod
+    def instance(**_args):
+        """
+        An instance of an object that trains and generates candidate datasets
+        :param apply                  operation to perform : apply.RANDOM -> Shuffle, apply.GENERATE -> Generator, anything else -> Trainer
+        :param start (optional)       the process is started if set to True
+        :param gpu (optional)         index of the gpu to be used if using one
+        :param store                  {source,target} if no target is provided console will be output
+        :param epochs (default 2)     number of epochs to train
+        :param candidates(default 1)  number of candidates to generate
+        :param info                   {columns,sql,from}
+        :param autopilot              will generate output automatically
+        :param batch (default 2k)     size of the batch
+        :return the process, started or not
+        """
+        _mode = _args['apply']
+        if _mode in [apply.RANDOM] :
+            _handler = Shuffle
+        elif _mode == apply.GENERATE :
+            _handler = Generator
+        else:
+            _handler = Trainer
+        pthread = _handler(**_args)
+        if _args.get('start') == True :
+            pthread.start()
+        return pthread
+
+class plugins:
+    @staticmethod
+    def load(_config):
+        """
+        This function is meant to load the plugins in order to ensure they are valid
+        NOTE: there is no implementation at this point, the function does nothing and returns None
+        :param _config configuration for plugin specifications {pre:{pipeline,path},post:{pipeline,path}}
+        """
+
+
--- a/data/maker/main.py
+++ b/data/maker/main.py
@ -1,10 +0,0 @@
-import pandas as pd
-import data.maker
-
-df      = pd.read_csv('sample.csv')
-column  = 'gender'
-id      = 'id' 
-context = 'demo'
-store = {"type":"mongo.MongoWriter","args":{"host":"localhost:27017","dbname":"GAN"}}
-max_epochs = 11
-data.maker.train(store=store,max_epochs=max_epochs,context=context,data=df,column=column,id=id,logs='foo')
--- a/data/maker/apply.py
+++ b/data/maker/apply.py
@ -0,0 +1,76 @@
+"""
+This file is designed to specify the application of pre/post-processing code.
+    The pre-processing code gets applied after the data has been loaded
+    The post-processing code gets applied after the data has been generated, for instance:
+        - approximation code/logic; date shifting; suppression; adding noise
+        - 
+"""
+import numpy as np
+from datetime import datetime, timedelta
+import time
+
+class Phase:
+    """
+    Base class of a processing phase, it keeps the data-frame and the callback it was given
+    and leaves the actual processing (apply) to its subclasses
+    """
+    def __init__(self,**_args):
+        """
+            :param  data        data-frame the phase operates on
+            :param  callback    callback function once done
+        """
+        self._df = _args['data']
+        self.callback = _args['callback']
+    def apply(self,**_args):
+        """
+            :param  data        data-frame
+            :param  _info       arguments needed to be applied
+            :param  callback    callback function once done
+            :raises NotImplementedError   always, subclasses must override this function
+        """
+        #
+        # NotImplementedError is a subclass of Exception, callers catching Exception are unaffected
+        raise NotImplementedError ("Function needs to be Implemented")
+class Pre(Phase):
+    # Pre-processing phase: nothing is added to Phase here, apply() is inherited as-is
+    pass
+class Post(Phase):
+    """
+    Post-processing phase, construction is delegated unchanged to Phase
+    """
+    def __init__(self,**_args):
+        Phase.__init__(self,**_args)
+
+class Date(Post):
+    """
+    Post-processing helper that fabricates random calendar dates for a given year
+    """
+    def __init__(self,**_args):
+        super().__init__(**_args)
+    def make(self,**_args):
+        """
+        This function generates a random date given a year and optionally a set of days from the randomly generated date
+        :param year     initial value of a year ('' , None or NaN yield None)
+        :param offset   (optional) list of day counts, each one is added to the previously generated date
+        :param format   (optional) strftime format of the output, default %Y-%m-%d
+        :return         a formatted date, or a list of formatted dates (initial date first) when offset is provided
+        """
+        import calendar
+        year = _args['year']
+        #
+        # NaN never compares equal to itself, a float NaN has to be tested explicitly
+        if year in ['',None] or (isinstance(year,float) and np.isnan(year)) :
+            return None
+        year = int(year)
+
+        offset = _args['offset'] if 'offset' in _args else 0
+        month   = int(np.random.randint(1,13))
+        #
+        # monthrange accounts for leap years (including the century rule)
+        _end = calendar.monthrange(year,month)[1]
+        #
+        # the upper bound of randint is exclusive, hence _end + 1 so the last day of the month can be drawn
+        day = int(np.random.randint(1,_end + 1))
+
+        #-- synthetic date
+        _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0)
+        FORMAT =  '%Y-%m-%d' if 'format' not in _args else _args['format']
+
+        if offset :
+            r = [_date.strftime(FORMAT)]
+            for _delta in offset :
+                _date = _date + timedelta(_delta)
+                #
+                # strftime formats the shifted date (strptime parses text and would fail here)
+                r.append(_date.strftime(FORMAT))
+            return r
+        else:
+            return _date.strftime(FORMAT)
+        
+    def apply(self,**_args):
+        """
+        Not implemented yet, the call does nothing
+        """
+        pass
+class Approximate(Post):
+    """
+    Placeholder for approximation post-processing, neither function is implemented yet
+    """
+    def apply(self,**_args):
+        #
+        # 'self' is required for the function to be callable on an instance
+        pass
+    def applyWithRange(self,**_args):
+        #
+        # an explicit body is needed, a bare function header is a syntax error
+        pass
--- a/data/maker/prepare/init.py
+++ b/data/maker/prepare/init.py
@ -0,0 +1,284 @@
+"""
+(c) 2018 - 2021, Vanderbilt University Medical Center
+Steve L. Nyemba, steve.l.nyemba@vumc.org
+
+This file is designed to handle preconditions for a generative adversarial network:
+    - The file will read/get data from a source specified by transport (or data-frame)
+    - The class will convert the data to a binary vector
+    - The class will also help rebuild the data from a binary matrix.
+Usage :
+
+"""
+import transport
+import json
+import pandas as pd
+import numpy as np
+# import cupy as cp
+import sys
+import os
+#
+# The following is to address the issue over creating a large matrix ...
+# 
+# from multiprocessing import Process, Queue
+
+# if 'GPU' in os.environ :
+#     import cupy as np
+# else:
+#     import numpy as np    
+class void:
+    # Empty placeholder class, no attribute or behaviour is defined here
+    pass
+class Hardware :
+    """
+    This class is intended to allow the use of hardware i.e. GPU, index or CPU
+    NOTE(review): placeholder only, no behaviour is implemented in this class yet
+    """
+    pass
+
+class Input :
+    """
+    This class is designed to read data from a source and perform a variety of operations :
+        - provide a feature space, and rows (matrix profile)
+        - a data index map
+    """
+    #
+    # The docstring must be the first statement of the class, otherwise Input.__doc__ is empty
+    class NOVALUES :
+        # names of the strategies available when no value can be decoded
+        RANDOM,IGNORE,ALWAYS = ['random','ignore','always']
+
+    def __init__(self,**_args):
+        """
+        :param data     data-frame to work with (required)
+        :param schema   (optional) schema of the data, defaults to an empty map
+        :param sql      (optional) when present the columns are initialized through _initsql, otherwise through _initdata
+        :param columns  (optional) columns to be synthesized
+        :param map      (optional) existing map of values, defaults to an empty map
+        """
+        self._schema = _args.get('schema',{})
+        self.df = _args['data']
+        if 'sql' in _args :
+            self._initsql(**_args)
+        else:
+            self._initdata(**_args)
+        #
+        # The map ties values, columns and vector positions together,
+        # it is what allows to convert to (and revert from) a binary matrix
+        #
+        self._map = _args.get('map',{})
+        
+        
+    def _initsql(self,**_args):
+        """
+        Initializes the columns from the data-frame held by the instance (self.df)
+        :param columns      (optional) list of columns to be synthesized, forwarded to _initcols when provided
+        """
+        _params = {'data':self.df}
+        if 'columns' in _args :
+            _params['columns'] = _args['columns']
+        self._initcols(**_params)
+    def _init_map(self,values):
+        """
+        Rebuilds the map as {position: values[position] as a list}
+        :param values   sequence of rows, each row must provide tolist() (e.g. numpy arrays)
+        """
+        _positions = np.arange(len(values))
+        self._map = dict(zip(_positions,values))
+        for _position,_row in self._map.items() :
+            self._map[_position] = _row.tolist()
+    def _initcols (self,**_args) :
+        """
+        This function will initialize the columns to be synthesized and compute, per column, the ratio of distinct values over rows
+        :param data         data-frame that holds the data (matrix); the frame actually used is self.df
+        :param columns      optional columns to be synthesized
+
+        Sets :
+            self.columns    all the columns of the data-frame
+            self._columns   columns to be synthesized
+            self._io        [{column: distinct-values / rows}] or [] if it could not be computed
+        """
+        self.columns = self.df.columns.tolist()
+        self._io = []
+        #
+        # Without an explicit list we fall back on every column, otherwise self._columns would
+        # not exist and convert/revert would fail with an AttributeError.
+        # NOTE(review): confirm all-columns is the intended default
+        cols = _args['columns'] if 'columns' in _args else None
+        self._columns = cols if cols else list(self.columns)
+        #
+        # We will look into the count and make a judgment call
+        try:
+            _df  = pd.DataFrame(self.df.nunique().T / self.df.shape[0]).T
+            self._io =  (_df.to_dict(orient='records'))
+            
+        except Exception as e:
+            # best effort: the ratios are informational, failing to compute them is not fatal
+            print (e)
+            self._io = []
+    def _initdata(self,**_args):
+        """
+        This function will initialize the class with a data-frame and columns of interest (if any),
+        every keyword argument is forwarded as-is to _initcols
+        :param data       data-frame that holds the data
+        :param columns    columns that need to be synthesized if any
+        """
+        self._initcols(**_args)
+
+    def _convert(self,**_args):
+        """
+        This function will convert a data-frame into a binary matrix and provide a map to be able to map the values back to the matrix 
+        :param columns|column   in case we specify the columns to account for (just in case the original assumptions don't hold), defaults to self._columns
+        :param data             (optional) data-frame to convert, defaults to self.df
+        :return                 (_values,_m) where _m is the column-wise concatenation of the binary matrix of each column
+                                and _values accumulates the value spaces that were already present in the map
+        """
+        if 'columns' in _args or 'column' in _args :
+            columns = _args['columns'] if 'columns' in _args else [_args['column']]
+        else:
+            columns = self._columns
+        _df = self.df if 'data' not in _args else _args['data']
+        #
+        # At this point we have the list of features we want to use
+        # i is the position (in the final matrix) at which the next column block begins
+        i = 0
+        
+        _m = np.array([])
+        _values  = []
+        for name in columns :
+            #
+            # In case we have  dataset with incomplete value space, we should still be able to generate something meaningful
+            # i.e. a value space already stored in the map takes precedence over the values found in the sample
+            #
+            values = None if name not in self._map else list(self._map[name]['values'])
+            # NOTE(review): _type is computed but not used afterwards
+            _type = self._schema[name] if name in self._schema else _df[name].dtype
+            cols, _matrix = self.tobinary(_df[name],values)
+            _beg,_end = i,i+len(cols)
+            # an existing map entry is left untouched, only new columns are registered
+            if name not in self._map :
+                self._map[name] = {"beg":_beg,"end":_end ,"values":cols.tolist()}
+            i += len(cols)
+            # the first block initializes the matrix, the following ones are appended as extra columns
+            if not _m.shape[0]:
+                _m = _matrix ;
+            else:
+                _m = np.concatenate((_m,_matrix),axis=1)
+            if values :
+                _values += list(values)
+        #
+        # @NOTE:
+        # The map should allow us to be able to convert or reconvert the binary matrix to whatever we want ...
+        #
+        # self._matrix = _m
+        
+        return _values,_m
+        
+    def _revert(self,**_args) :
+        """
+        This function will take in a binary matrix and based on the map of values it will repopulate it with values
+        :param matrix   binary matrix (rows x bits)
+        :param column   (optional) name of the only column to rebuild, by default every column of the map is rebuilt
+        :return         data-frame with one column per map entry, rows without any bit set yield ''
+        """
+        _column = _args['column'] if 'column' in _args else None
+        matrix = _args['matrix']
+        r = {}
+        for key in self._map :
+            if _column and key != _column :
+                continue
+            _item = self._map[key]
+            columns = np.array(_item['values'])
+            #
+            # @NOTE: We are accessing matrices in terms of [row,col],
+            # The beg,end entries delimit the columns of this feature in the matrix (end is not included)
+            #
+            _matrix = matrix[:,_item['beg']:_item['end']]
+            #
+            # replace the bits by their actual values, the first bit set wins
+            # @TODO: Find ways to do this on a GPU (for big data) or across threads
+            #
+            _decoded = []
+            for row in _matrix :
+                _found = np.where(row==1)[0]
+                _decoded.append(columns[_found[0]] if _found.size > 0 else '')
+            r[key] = _decoded
+        return pd.DataFrame(r)
+     
+    def tobinary(self,rows,cols=None) :
+        """
+        This function will compile a binary (one-hot) matrix from a vector of values
+        :param rows     pandas series, np.array or list of values
+        :param cols     a space of values if it were to be different from the current sample (list or np.array),
+                        by default the distinct values of rows are used (in order of appearance)
+        :return         (cols,_matrix) where _matrix[i][j] is 1 if rows[i] == cols[j]; a row whose value is not in cols is all zeros
+        """
+        _rows = rows if isinstance(rows,pd.Series) else pd.Series(rows)
+        #
+        # cols may be a numpy array, its truth value is ambiguous hence the explicit test
+        if cols is None or len(cols) == 0 :
+            cols = _rows.unique()
+        cols = np.array(cols)
+        #
+        # The values are accessed by position: indexing the series directly is label based
+        # and fails (or picks the wrong row) when the index is not 0..n-1
+        _values = _rows.tolist()
+        row_count = np.int64(len(_values))
+        #
+        # @NOTE: For some reason, there is an out of memory error created with np.zeros, this seems to fix it (go figure)
+        #
+        _matrix = np.array([np.repeat(0,cols.size) for i in range(0,row_count)])
+        for i in np.arange(row_count) :
+            _found = np.where(cols ==  _values[i])[0]
+            if _found.size > 0 :
+                np.put(_matrix[i],_found,1)
+        
+        return cols,_matrix
+    def convert(self,**_args):
+        """
+        Encodes every distinct combination of values (across the columns) as one position of a binary matrix
+        :param columns|column   (optional) columns to account for, defaults to self._columns
+        :param data             (optional) data-frame to convert, defaults to self.df
+        :return                 (_values,_matrix) distinct combinations and the binary matrix; the map is rebuilt from _values
+        """
+        if 'columns' in _args :
+            columns = _args['columns']
+        elif 'column' in _args :
+            columns = [_args['column']]
+        else:
+            columns = self._columns
+        _df = _args['data'] if 'data' in _args else self.df
+        _values,_encoded = self.encode(_df,columns)
+        _matrix = self.tobinary(_encoded)[1]
+        self._init_map(_values)
+        return _values,_matrix
+    def revert(self,**_args):
+        """
+        Rebuilds a data-frame out of a binary matrix, see decode
+        :param matrix   binary matrix to be decoded against self._columns
+        """
+        return self.decode(_args['matrix'],columns=self._columns)
+    def encode(self,df,columns) :
+        """
+        Assigns to each row the position of its combination of values within the distinct combinations
+        :param df       data-frame
+        :param columns  columns that make up a combination
+        :return         (np.array of the distinct combinations, series of positions - one per row of df)
+        """
+        _subset = df[columns]
+        _space = _subset.drop_duplicates().values.tolist()
+        _codes = _subset.apply(lambda _row: _space.index(list(_row)),axis=1)
+        return np.array(_space),_codes
+    def decode (self,_matrix,**_args):
+        """
+        Replaces each row of a binary matrix by the combination of values stored in the map at the position of the bit set
+        :param _matrix  binary matrix
+        :param columns  names of the columns of the returned data-frame
+        :return         data-frame of decoded values
+        """
+      
+        columns = _args['columns']
+        _values = np.array( list(self._map.values()))
+        _matrix = pd.DataFrame(_matrix) #if type(_matrix) != pd.DataFrame else _matrix
+        #@TODO: Provide random values for things that are missing
+        #
+        # @TODO: Provide a parameter to either:
+        #   - missing = {outlier,random,none}
+        #   -   outlier: select an outlier, random: randomly select a value, none: do nothing ...
+        # 
+        # The replacement for rows without any bit set is drawn ONCE (coin flip) and shared by all such rows:
+        # either one randomly selected combination of values or a vector of None
+        if np.random.choice([0,1],1)[0] :
+            novalues = _values[np.random.choice( len(_values),1)[0]].tolist()
+        else:
+            # NOTE(review): sized on self._columns rather than on the columns argument -- confirm they always match
+            novalues = np.repeat(None,len(self._columns))        
+        # the first combination whose bit is set wins
+        x = _matrix.apply(lambda row: _values[row.values == 1].tolist()[0] if (row.values == 1).sum() > 0 else novalues ,axis=1).tolist()
+        return pd.DataFrame(x,columns=columns)
+
+
+
--- a/data/maker/prepare/main.py
+++ b/data/maker/prepare/main.py
@ -0,0 +1 @@
+__init__.py
--- a/data/maker/state/init.py
+++ b/data/maker/state/init.py
@ -0,0 +1,105 @@
+"""
+This file handles state-space of the data training/generation process i.e Upon specification of the pre/post conditions
+"""
+"""
+This file handles state-space of the data training/generation process i.e Upon specification of the pre/post conditions,
+The specifications for this are as follows (within an entry of the configuration)
+    {
+        "state":{
+            "pre":[{"approximate":{"field":"int"}},{"newdate":{"field":"format"}}],"post":[{"limit":10}]
+        }
+    }
+"""
+import importlib
+import importlib.util
+import sys
+from datetime import datetime
+from data.maker.state.default import *
+import os
+
+
+class State :
+    @staticmethod
+    def apply(_data,lpointers):
+        """
+        Runs a data-frame through a pipeline, the output of a function is the input of the next one.
+        The calling code must decide whether the pipeline is a pre or a post processing one
+        :_data      data-frame
+        :lpointers  list of {module,args} as returned by instance, None entries are skipped
+        """
+        for _entry in lpointers :
+            if _entry is not None :
+                _function,_params = _entry['module'],_entry['args']
+                _data = _function(_data,_params)
+        return _data
+    @staticmethod
+    def instance(_args):
+        """
+        Builds the pipelines described in a configuration
+        :_args  {name:{pipeline:[...],path(optional)}} e.g. name is pre or post
+        :return {name:[entries built by _build]} in the order of the pipeline
+        """
+        out  = {}
+        for key,_spec in _args.items() :
+            #
+            # The path of the pipeline is the default for the items that do not have one of their own
+            path = _spec['path'] if 'path' in _spec else ''
+            _pipeline = []
+            for _item in _spec['pipeline'] :
+                if 'path' not in _item :
+                    _item = dict(_item,**{'path':path})
+                _pipeline.append(State._build(_item))
+            out[key] = _pipeline
+        return out
+   
+    @staticmethod
+    def _extract(_entry):
+        """
+        Normalizes a pipeline entry {name:args,path(optional)}
+        :return {module,args,name,path} where module and name are the name of the function
+                and path is '' unless it designates something that exists on disk
+        """
+        _name = list(set(_entry.keys()) - set(['path']) )[0]
+        path = ''
+        if 'path' in _entry and os.path.exists(_entry['path']) :
+            path = _entry['path']
+        return {"module": _name,"args": _entry[_name],'name':_name,'path':path}
+    @staticmethod
+    def _build(_args):
+        """
+        Turns a pipeline entry into {module,args,name,path} where module is the function to call
+        :return None if the function could not be found
+        """
+        _info = State._extract(_args)
+        _pointer = State._instance(_info)
+        _info['module'] = _pointer
+        if _pointer is None :
+            return None
+        return _info
+            
+    @staticmethod
+    def _instance(_args):
+        """
+            :path     optional path of the file on disk, when it exists the file is loaded as a module
+            :module   name of the function
+            :return   the function, or None if the module does not have it
+        """
+        _name = _args['module']
+        if 'path' in _args and os.path.exists(_args['path']):
+            spec = importlib.util.spec_from_file_location(_name,_args['path'])
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+        else:
+            #
+            # No file to load: the function is looked up among the built-in ones
+            module = sys.modules['data.maker.state.default'] 
+        return getattr(module,_name,None)
+ 
+#
+# Adding a few custom functions that should be able to help ....
+# These functions can be called without specifying a path
+#
+
--- a/data/maker/state/default.py
+++ b/data/maker/state/default.py
@ -0,0 +1,116 @@
+"""
+This file contains default functions applied to a data-frame/dataset as pre/post processing jobs. 
+The functions are organized in a pipeline i.e the data will be applied to each function 
+
+Custom functions :
+    functions must take 2 arguments (_data,_args) : where _data is a data frame and _args is an object describing the input parameters
+"""
+import pandas as pd
+import numpy as np
+from datetime import datetime, timedelta
+
+
+def limit(_data,size):
+    """
+        ...,{limit:size}
+    Keeps the first rows of a data-frame
+    :_data  data-frame
+    :size   number of rows to keep
+    """
+    _head = _data.iloc[:size]
+    return _head
+def format(_data,_schema):
+    """
+    This function is meant to enforce a schema against a data-frame, this may or may not work depending on the persistence storage
+    :_data    data-frame containing all data
+    :_schema  schema to enforce the data, we are expecting the format as a list of {name,type,description}
+
+    NOTE: nothing is enforced at this point, the data-frame is returned as-is
+    """   
+    return _data 
+
+def approximate(_data,_args):
+    """
+    :_data  data-frame, it is modified in place and returned
+    :_args  Object of {field:type}, type is one of int,integer,float,double (anything else is treated as str)
+    This function will approximate n-fields in the data given its distribution
+    """
+    _m = {'int':int,'float':float,'integer':int,'double':float}
+    columns = list(_args.keys())
+    for _name in columns :
+        # fields that are not in the data-frame are silently skipped
+        if _name not in _data :
+            continue
+        otype = _args[_name]
+        otype = str if otype not in _m else _m[otype]
+        # NOTE(review): the values of the column are passed as the first argument (low) of uniform
+        # while high keeps numpy's default -- confirm this is the intended approximation
+        _data.loc[:,_name] = np.random.uniform(_data[_name].values).astype(otype)
+        
+    return _data
+def split_date(_data,_args):
+    """
+    This function fills columns with a date taken from another column and written in another format
+    :_data  data-frame, it is modified in place and returned
+    :_args  configuration entry {column:{column:source-column,format:{in,out},type(optional)}}
+            type is handed to astype, the default is str
+    """
+    for _name,_spec in _args.items() :
+        _source = _spec['column']
+        _infmt,_outfmt = _spec['format']['in'],_spec['format']['out']
+        _otype = _spec['type'] if 'type' in _spec else str
+        def _reformat(_date,_in=_infmt,_out=_outfmt) :
+            # parse with the input format, write with the output format
+            return datetime.strptime(str(_date),_in).strftime(_out)
+        _data.loc[:,_name] = _data[_source].apply(_reformat).astype(_otype)
+    return _data
+def newdate(_data,_args):
+    """
+    This function overwrites columns with randomly generated dates
+    :_data  data frame, it is modified in place and returned
+    :_args  configuration {column:{format,column(optional)}} where the optional column holds the years to use,
+            without it the years are drawn randomly among the 90 years preceding the current one
+    """
+    for _name in list(_args.keys()) :
+        _spec = _args[_name]
+        _format = _spec['format']
+        ROW_COUNT = _data[_name].size
+        if 'column' in _spec :
+            years = _data[_spec['column']].values
+        else:
+            years = np.random.choice(np.arange(datetime.now().year- 90,datetime.now().year),ROW_COUNT)
+        _dates = []
+        for _index in range(ROW_COUNT) :
+            _dates.append(_makedate(year = years[_index],format = _format))
+        _data.loc[:,_name] = _dates
+    return _data
+def _makedate(**_args):
+    """
+    This function creates a random date within a given year
+    :year     year of the date (anything int() accepts)
+    :format   (optional) strftime format of the output, default %Y-%m-%d
+    :offset   (optional) list of day counts, each one is added to the previously generated date
+    :return   a formatted date, or a list of formatted dates (initial date first) when offset is provided
+    """
+    import calendar
+    year = int(_args['year'])
+    offset = _args['offset'] if 'offset' in _args else 0
+    month   = int(np.random.randint(1,13))
+    #
+    # monthrange accounts for leap years (including the century rule)
+    _end = calendar.monthrange(year,month)[1]
+    #
+    # the upper bound of randint is exclusive, hence _end + 1 so the last day of the month can be drawn
+    day = int(np.random.randint(1,_end + 1))
+
+    #-- synthetic date
+    _date = datetime(year=year,month=month,day=day,minute=0,hour=0,second=0)
+    FORMAT =  '%Y-%m-%d'
+    
+    if 'format' in _args:
+        FORMAT = _args['format']
+    
+    if offset :
+        r = [_date.strftime(FORMAT)]
+        for _delta in offset :
+            _date = _date + timedelta(_delta)
+            #
+            # strftime formats the shifted date (strptime parses text and would fail here)
+            r.append(_date.strftime(FORMAT))
+        return r
+    else:
+        return _date.strftime(FORMAT)
+
--- a/data/params.py
+++ b/data/params.py
@ -1,6 +1,6 @@
 import sys

-SYS_ARGS  = {'context':''}
+SYS_ARGS  = {}
 if len(sys.argv) > 1:
 	
 	N = len(sys.argv)
@ -9,8 +9,10 @@ if len(sys.argv) > 1:
 		if sys.argv[i].startswith('--'):
 			key = sys.argv[i][2:] #.replace('-','')
 			SYS_ARGS[key] = 1
-			if i + 1 < N:
+			if i + 1 < N and not sys.argv[i + 1].startswith('--'):
 				value = sys.argv[i + 1] = sys.argv[i+1].strip()
+			else:
+				value = None
 			if key and value:
 				SYS_ARGS[key] = value
 				
--- a/drive/pipeline.py
+++ b/drive/pipeline.py
@ -0,0 +1,303 @@
+#!/usr/bin/env python3
+import json
+from transport import factory
+import numpy as np
+import os
+from multiprocessing import Process
+import pandas as pd
+from google.oauth2 import service_account
+import data.maker
+
+from data.params import SYS_ARGS 
+
+#
+# The configuration array is now loaded and we will execute the pipe line as follows
+DATASET='combined20190510'
+
+class Components :
+
+	@staticmethod
+	def get(args):
+		"""
+		This function returns a data-frame provided a bigquery sql statement with conditions (and limits for testing purposes)
+		The function must be wrapped around a lambda this makes testing easier and changing data stores transparent to the rest of the code. (Vital when testing)
+		:sql	basic sql statement, ':dataset' is replaced by the name of the dataset
+		:dataset	name of the dataset
+		:condition	optional condition and filters {field,qualifier,value}
+		:limit	optional maximum number of rows
+		:return	data-frame without the rows that have missing values
+		"""
+		SQL = args['sql']
+		if 'condition' in args :
+			condition = ' '.join([args['condition']['field'],args['condition']['qualifier'],'(',args['condition']['value'],')'])
+			SQL = " ".join([SQL,'WHERE',condition])
+
+		SQL = SQL.replace(':dataset',args['dataset']) #+ " LIMIT 1000 "
+		if 'limit' in args :
+			#
+			# the leading space keeps LIMIT from being glued to the end of the statement,
+			# str() allows the limit to be provided as a number
+			SQL = SQL + ' LIMIT ' + str(args['limit'])
+		credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
+		df = pd.read_gbq(SQL,credentials=credentials,dialect='standard').dropna()
+		return df
+		
+		# return lambda: pd.read_gbq(SQL,credentials=credentials,dialect='standard')[args['columns']].dropna()
+	@staticmethod
+	def split(X,MAX_ROWS=3,PART_SIZE=3):
+		"""
+		Cuts the row positions of a data-frame (0 .. number of rows) into PART_SIZE bins
+		:X	data-frame
+		:MAX_ROWS	not used
+		:PART_SIZE	number of bins
+		:return	list of intervals, one per bin
+		"""
+		_positions = np.arange(X.shape[0]+1)
+		_bins = pd.cut(_positions,PART_SIZE)
+		return list(_bins.categories)
+
+	def train(self,**args):
+		"""
+		This function will perform training on the basis of a given pointer that reads data
+		:reader	(optional) function without arguments that returns a data-frame, by default the data is read with Components.get
+		:context	name of the context (used for logging and the log folder)
+		:columns	columns to train on
+		:logs	(optional) log folder, default 'logs'
+		:max_epochs	(optional) default 150
+		:num_gpu	(optional) default 1
+		:gpu	(optional) index of the gpu, default 0
+		:max_rows,part_size	(optional) when the data has more than max_rows rows (and no partition is given)
+				the data is cut in part_size partitions that are posted to a queue instead of being trained here
+		:partition	(optional) partition being trained
+		"""
+		#
+		# @TODO: we need to log something here about the parameters being passed
+		# Components.get expects the arguments as a single dictionary (not as keyword arguments)
+		pointer  = args['reader'] if 'reader' in args else lambda: Components.get(args)
+		df = pointer()
+
+		#
+		# Now we can parse the arguments and submit the entire thing to training
+		#
+		
+		logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
+		log_folder = args['logs'] if 'logs' in args else 'logs'
+		_args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger}
+		_args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
+		_args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 
+
+		MAX_ROWS = args['max_rows'] 	if 'max_rows' in args else 0
+		PART_SIZE = args['part_size'] 	if 'part_size' in args else 0
+		
+		if df.shape[0] > MAX_ROWS and 'partition' not in args:
+			lbound = 0
+			bounds = list(pd.cut( np.arange(df.shape[0]+1),PART_SIZE).categories)
+			# bounds = Components.split(df,MAX_ROWS,PART_SIZE)
+			
+			qwriter = factory.instance(type='queue.QueueWriter',args={'queue':'aou.io'})
+			
+			for b in bounds :
+				part_index = bounds.index(b)
+				ubound = int(b.right)
+				
+					
+				_data =  df.iloc[lbound:ubound][args['columns']]
+				lbound = ubound
+				
+				# _args['logs'] = os.sep.join([log_folder,str(part_index)])
+				_args['partition'] = str(part_index)
+				# the logger is posted as a configuration so the message can be serialized
+				_args['logger'] = {'args':{'dbname':'aou','doc':args['context']},'type':'mongo.MongoWriter'}
+				#
+				# We should post the partitions to a queue server (at least the instructions on ):
+				#	- where to get the data
+				#	- and the arguments to use (partition #,columns,gpu,epochs)
+				#
+				# NOTE(review): the key "paritition" is misspelled, it is left untouched in case readers of the queue/log rely on it
+				info = {"rows":_data.shape[0],"cols":_data.shape[1], "paritition":part_index,"logs":_args['logs']}
+				p = {"args":_args,"data":_data.to_dict(orient="records"),"info":info}
+				qwriter.write(p)
+				#
+				# @TODO:
+				#	- Notify that information was just posted to the queue
+				info['max_rows'] 	= MAX_ROWS
+				info['part_size'] 	= PART_SIZE
+				logger.write({"module":"train","action":"setup-partition","input":info})
+			
+			pass
+		else:
+			partition = args['partition'] if 'partition' in args else ''
+			log_folder = os.sep.join([log_folder,args['context'],partition])
+			_args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger}
+			_args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
+			_args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 
+			os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0'
+			
+			_args['data'] = df
+			#
+			# @log :
+			#	Logging information about the training process for this partition (or not)
+			#
+			info = {"rows":df.shape[0],"cols":df.shape[1], "partition":partition,"logs":_args['logs']}
+			logger.write({"module":"train","action":"train","input":info})
+			data.maker.train(**_args)
+
+		pass
+		
+	# @staticmethod
+	def generate(self,args):
+		"""
+		This function will generate data and store it either on disk (csv) or in bigquery
+		:reader	function without arguments that returns the original data-frame
+		:context	name of the context (used for logging, the log folder and the name of the tables)
+		:columns	columns to be synthesized
+		:notify	prefix of the name of the tables/files
+		:no_value	forwarded to the generator
+		:logs	log folder
+		:partition	(optional) partition to generate
+		:file	(optional) if present the output is written to csv files only, otherwise to bigquery
+		"""
+		logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
+		log_folder = args['logs'] if 'logs' in args else 'logs'
+		partition = args['partition'] if 'partition' in args else ''
+		log_folder = os.sep.join([log_folder,args['context'],partition])
+		_args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger}
+		_args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
+		_args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1 
+		os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0'
+		_args['no_value']= args['no_value']
+		MAX_ROWS = args['max_rows']  	if 'max_rows' in args else 0
+		PART_SIZE = args['part_size'] 	if 'part_size' in args else 0
+		
+		reader = args['reader']
+		df = reader()
+		if 'partition' in args :
+			# only the rows of the requested partition are kept
+			bounds = Components.split(df,MAX_ROWS,PART_SIZE)
+			lbound = int(bounds[int(partition)].left)
+			ubound = int(bounds[int(partition)].right)
+			df = df.iloc[lbound:ubound]
+		_args['data'] = df
+		_dc = data.maker.generate(**_args) 
+		#
+		# We need to post the generate the data in order to :
+		#	1. compare immediately
+		#	2. synthetic copy
+		#
+		
+		cols = _dc.columns.tolist()
+		
+		data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io')				#-- will be used for comparison (store this in big query)
+		base_cols = list(set(_args['data'].columns) - set(args['columns']))	#-- rebuilt the dataset (and store it)
+		
+		for name in cols :
+			_args['data'][name] = _dc[name]
+			info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}}
+			if partition != '' :
+				info['partition'] = partition
+			logger.write(info)
+
+		#
+		#-- Let us store all of this into bigquery
+		prefix = args['notify']+'.'+_args['context']
+		table = '_'.join([prefix,partition,'io']).replace('__','_')
+		folder = os.sep.join([args['logs'],args['context'],partition,'output']) 
+		if 'file' in args :
+			
+			_fname = os.sep.join([folder,table.replace('_io','_full_io.csv')])
+			_pname = os.sep.join([folder,table])+'.csv'
+			data_comp.to_csv( _pname,index=False)
+			_args['data'].to_csv(_fname,index=False)
+			
+			
+		else:
+			credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
+			_pname = os.sep.join([folder,table+'.csv'])
+			_fname = table.replace('_io','_full_io')
+			#
+			# The credentials object (not the word 'credentials') must be handed to bigquery and
+			# the destination is the name of the table, _pname is the path of the csv copy
+			data_comp.to_gbq(if_exists='replace',destination_table=table,credentials=credentials,chunk_size=50000)	
+			data_comp.to_csv(_pname,index=False)
+			INSERT_FLAG = 'replace' if 'partition' not in args else 'append'	
+			_args['data'].to_gbq(if_exists=INSERT_FLAG,destination_table=_fname,credentials=credentials,chunk_size=50000)
+		
+		info = {"full":{"path":_fname,"rows":_args['data'].shape[0]},"compare":{"name":_pname,"rows":data_comp.shape[0]} }
+		if partition :
+			info ['partition'] = partition
+		logger.write({"module":"generate","action":"write","info":info} )
+	@staticmethod
+	def callback(channel,method,header,stream):
+		"""
+		Handler of a message read from the queue: the message is a partition to train on
+		:channel	channel the message was read from, it is closed (with its connection) once training returns
+		:method,header	not used
+		:stream	json text of {args,data,info} as posted by train
+		"""
+		
+		info = json.loads(stream)
+		logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':SYS_ARGS['context']})
+		
+		logger.write({'module':'process','action':'read-partition','input':info['info']})
+		# the rows of the partition travel with the message
+		df = pd.DataFrame(info['data'])
+		args = info['args']
+		# NOTE(review): args['gpu'] is read without a default, presumably always present in the message -- confirm
+		if int(args['num_gpu']) > 1 and args['gpu'] > 0:
+			args['gpu'] = args['gpu'] + args['num_gpu']
+		args['reader'] = lambda: df
+		#
+		# @TODO: Fix
+		# 	There is an inconsistency in column/columns (train reads 'columns', the message carries 'column')
+		#
+		args['columns'] = args['column']		
+		(Components()).train(**args)		
+		logger.write({"module":"process","action":"exit","info":info["info"]})
+		channel.close()
+		channel.connection.close()
+		pass
+	
+if __name__ == '__main__' :
+	#
+	# Entry point: the configuration is a json array, --index selects the entry to run
+	#	--generate	generates data (for every partition found in the log folder)
+	#	--listen	starts workers that train on the partitions posted to the queue
+	#	otherwise	trains
+	filename = SYS_ARGS['config'] if 'config' in SYS_ARGS else 'config.json'
+	# the file is closed even if the content can not be parsed
+	with open (filename) as f :
+		PIPELINE = json.loads(f.read())
+	index = int(SYS_ARGS['index']) if 'index' in SYS_ARGS else 0
+	
+	args =  (PIPELINE[index])
+	args['dataset'] = 'combined20190510'
+	# the command line has precedence over the configuration
+	args = dict(args,**SYS_ARGS)
+	args['max_rows'] = int(args['max_rows']) 	if 'max_rows' in args else 3
+	args['part_size']= int(args['part_size']) 	if 'part_size' in args else 3
+
+	#
+	# @TODO:
+	#	Log what was initiated so we have context of this processing ...
+	#
+	if 'listen' not in SYS_ARGS :
+		if 'file' in args :
+			reader = lambda: pd.read_csv(args['file']) ;
+		else:
+			reader = lambda: Components().get(args)
+		args['reader'] = reader
+	
+	if 'generate' in SYS_ARGS :
+		#
+		# Let us see if we have partitions given the log folder
+		
+		content = os.listdir( os.sep.join([args['logs'],args['context']]))
+		generator = Components()
+		if ''.join(content).isnumeric() :
+			#
+			# we have partitions we are working with, each folder name is a partition
+			# (iterating over the joined names would split partitions >= 10 into digits)
+			for _partition in sorted(content,key=int) :
+				args['partition'] = _partition
+				
+				generator.generate(args)
+		else:
+			generator.generate(args)
+	elif 'listen' in args :
+		#
+		# This will start a worker just in case to listen to a queue
+		if 'read' in SYS_ARGS :
+			QUEUE_TYPE = 'queue.QueueReader'
+			pointer = lambda qreader: qreader.read(1)
+		else:
+			QUEUE_TYPE = 'queue.QueueListener'
+			pointer = lambda qlistener: qlistener.listen()
+		N = int(SYS_ARGS['jobs']) if 'jobs' in SYS_ARGS else 1 
+		
+		qhandlers = [factory.instance(type=QUEUE_TYPE,args={'queue':'aou.io'}) for i in np.arange(N)]
+		jobs = []
+		for qhandler in qhandlers :
+			qhandler.callback = Components.callback
+			job = Process(target=pointer,args=(qhandler,))
+			job.start()
+			jobs.append(job)
+		#
+		# let us wait for the jobs, join blocks without burning cpu cycles
+		print (["Started ",len(jobs)," trainers"])
+		for job in jobs :
+			job.join()
+	else:
+		
+		trainer = Components()
+		trainer.train(**args)
+		# Components.train(**args)
+#for args in PIPELINE :
+	#args['dataset'] = 'combined20190510'
+	#process = Process(target=Components.train,args=(args,))
+	#process.name = args['context']
+	#process.start()
+#	Components.train(args)
--- a/pipeline.py
+++ b/pipeline.py
@ -0,0 +1,692 @@
+#!/usr/bin/env python3
+import json
+from transport import factory
+import numpy as np
+import time
+import os
+from multiprocessing import Process, Lock
+import pandas as pd
+from google.oauth2 import service_account
+from google.cloud import bigquery as bq
+import data.maker
+import copy
+from data.params import SYS_ARGS 
+                 
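+#
+# The steps of the pipeline (train, generate, shuffle, post, finalize) are implemented by the class below,
+# the configuration is loaded and the steps are executed in the main section at the end of this file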
+
+class Components :
+	lock = Lock()
+	class KEYS :
+		# Names of the entries looked up in the arguments/configuration
+		PIPELINE_KEY = 'pipeline'	# not referenced by the visible code of this class
+		SQL_FILTER = 'filter'	# entry of the arguments of `get` that holds the filter(s) of the query
+	@staticmethod
+	def get_filter (**args):
+		"""
+		This function builds a single SQL condition out of a filter specification
+		:field		left hand side of the condition
+		:qualifier	operator of the condition, the value is wrapped in parentheses when it is 'IN'
+		:value		right hand side of the condition (string)
+		:return		the terms of the condition separated by a space
+		"""
+		_qualifier = args['qualifier']
+		_terms = [args['field'],_qualifier]
+		if _qualifier == 'IN' :
+			_terms += ['(',args['value'],')']
+		else:
+			_terms.append(args['value'])
+		return ' '.join(_terms)
+	@staticmethod
+	def get_logger(**args) :
+		"""
+		This function returns a 'mongo.MongoWriter' created by the transport factory for the database 'aou'
+		:context	name of the document the writer is bound to
+		"""
+		_store = {'dbname':'aou','doc':args['context']}
+		return factory.instance(type='mongo.MongoWriter',args=_store)
+	@staticmethod
+	def get(args):
+		"""
+		This function returns a data-frame provided a bigquery sql statement with conditions (and limits for testing purposes)
+		The function must be wrapped around a lambda this makes testing easier and changing data stores transparent to the rest of the code. (Vital when testing)
+		:sql		basic sql statement, the token ':dataset' is replaced by the name of the dataset
+		:dataset	name of the dataset
+		:context	name of the document the query is logged to (see get_logger)
+		:filter		optional filter or list of filters (see get_filter), they are combined with AND in a WHERE clause
+		:limit		optional maximum number of rows (integer or string)
+		:private_key	optional path of the service account file, by default the path that used to be hard-coded
+		:return		data-frame returned by pandas.read_gbq
+		"""
+		SQL = args['sql']
+		if Components.KEYS.SQL_FILTER in args :
+			FILTER_KEY = Components.KEYS.SQL_FILTER
+			SQL_FILTER = args[FILTER_KEY] if type(args[FILTER_KEY]) == list else [args[FILTER_KEY]]
+			condition = ' AND '.join([Components.get_filter(**item) for item in SQL_FILTER])
+			SQL = " ".join([SQL,'WHERE',condition])
+
+		SQL = SQL.replace(':dataset',args['dataset'])
+		if 'limit' in args :
+			#
+			# The limit is converted because concatenating an integer to a string raises a TypeError
+			SQL = SQL + ' LIMIT ' + str(args['limit'])
+		#
+		# let's log the sql query that has been performed here
+		logger = Components.get_logger(context=args['context'])
+		logger.write({"module":"bigquery","action":"read","input":{"sql":SQL}})
+		#
+		# The location of the service account file can be overridden by the caller
+		_path = args['private_key'] if 'private_key' in args else '/home/steve/dev/aou/accounts/curation-prod.json'
+		credentials = service_account.Credentials.from_service_account_file(_path)
+		return pd.read_gbq(SQL,credentials=credentials,dialect='standard')
+	@staticmethod
+	def split(X,MAX_ROWS=3,PART_SIZE=3):
+		"""
+		This function cuts the positions 0..X.shape[0] into bins and returns the bins (intervals) as a list
+		:X		object that has a shape (data-frame, matrix)
+		:MAX_ROWS	not used
+		:PART_SIZE	number of bins
+		"""
+		_positions = np.arange(X.shape[0]+1)
+		_bins = pd.cut(_positions,PART_SIZE)
+		return list(_bins.categories)
+	def format_schema(self,schema):
+		"""
+		This function maps the fields of a schema to the python type used to represent them
+		:schema	list of fields that have the attributes name and field_type
+		:return	dictionary {name:type}, FLOAT is mapped to float, INTEGER to int and anything else to str
+		"""
+		_types = {'FLOAT':float,'INTEGER':int}
+		return {_field.name:_types.get(_field.field_type,str) for _field in schema}
+	def get_ignore(self,**_args) :
+		"""
+		This function returns the columns of a data-frame whose name contains at least one of the given terms
+		:data		data-frame whose columns are inspected
+		:columns	terms that are searched in the names of the columns
+		:return		list of names (order of the data-frame), empty list if one of the arguments is missing
+		"""
+		if 'columns' not in _args or 'data' not in _args :
+			return []
+		_terms = _args['columns']
+		return [_name for _name in _args['data'].columns if any([_term in _name for _term in _terms])]
+	def set_gpu(self,**_args) :
+		"""
+		This function exposes a gpu to the current process by setting the environment variable CUDA_VISIBLE_DEVICES
+		:gpu	identifier (integer or string) or list of identifiers, only the first identifier is exposed
+		:return	the identifiers as a list, None if no identifier was provided (the environment is left untouched)
+		"""
+		if 'gpu' not in _args or _args['gpu'] is None :
+			return None
+		gpu = _args['gpu']
+		if np.isscalar(gpu) :
+			#
+			# A single identifier was provided, an integer used to fail on gpu[0]
+			gpu = [gpu]
+		if len(gpu) == 0 :
+			return None
+		os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu[0])
+		return gpu
+	def train(self,**args):
+		"""
+		This function will perform training on the basis of a given pointer that reads data
+		:file		optional path of a csv file that holds the training data
+		:data		optional data-frame that holds the training data (used when no file is provided)
+		:store		data-store configuration, store.source is read when neither file nor data is provided
+		:sql		query submitted to the reader of the source
+		:row_limit	optional limit passed to the reader of the source
+		:from		optional name of the table, used to look up the schema if the reader has a meta function
+		:continuous	optional list of columns that are removed from the training data
+		:ignore		optional {"columns":[...]} terms of the names of the columns removed from the training data
+		:gpu		optional gpu identifier(s) (see set_gpu)
+		:partition	optional partition passed to the trainer
+		:autopilot	if provided the data is generated right after the training
+		The other arguments are passed as they are to data.maker.train
+		"""
+		schema = None
+		if 'file' in args :
+			df = pd.read_csv(args['file'])
+			del args['file']
+		elif 'data' not in args :
+			reader = factory.instance(**args['store']['source'])
+			if 'row_limit' in args :
+				df = reader.read(sql=args['sql'],limit=args['row_limit'])
+			else:
+				df = reader.read(sql=args['sql'])
+			schema = reader.meta(table=args['from']) if hasattr(reader,'meta') and 'from' in args else None
+		else:
+			df = args['data']
+
+		if schema :
+			#
+			# The columns are cast to the type found in the schema,
+			# the missing values are replaced by 0 (FLOAT, INTEGER) or by an empty string (anything else)
+			_schema = []
+			for _item in schema :
+				_type = int
+				_value = 0
+				if _item.field_type == 'FLOAT' :
+					_type =float
+				elif _item.field_type != 'INTEGER' :
+					_type = str
+					_value = ''
+				_schema += [{"name":_item.name,"type":_item.field_type}]
+				df[_item.name] = df[_item.name].fillna(_value).astype(_type)
+			args['schema'] = _schema
+		_args = copy.deepcopy(args)
+		_args['data'] = df
+		#
+		# The columns that are continuous should also be skipped because they don't need to be synthesized (like-that)
+		x_cols = args['continuous'] if 'continuous' in args else []
+
+		if 'ignore' in args and 'columns' in args['ignore'] :
+			_cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
+			_args['data'] = df[ list(set(df.columns)- set(_cols))]
+		#
+		# We need to make sure that continuous columns are removed
+		if x_cols :
+			_args['data'] = _args['data'][list(set(_args['data'].columns) - set(x_cols))]
+		if 'gpu' in args :
+			_args['gpu'] = self.set_gpu(gpu=args['gpu'])
+		if 'partition' in args :
+			_args['partition'] = args['partition']
+		#
+		# The trainer is only invoked if there is something left to learn from i.e rows AND columns
+		# once the ignored and continuous columns are removed (the rows used to be tested twice)
+		if _args['data'].shape[0] and _args['data'].shape[1] :
+			#
+			# We have a full blown matrix to be processed
+			print ('-- Training --')
+			data.maker.train(**_args)
+		else:
+			print ("... skipping training !!")
+
+		if 'autopilot' in args :
+			#
+			# The generator is given the data that was read so it doesn't read it again
+			args['data'] = df
+			print (['autopilot mode enabled ....',args['context']])
+			self.generate(args)
+
+	def approximate(self,values):
+		"""
+		This function returns randomly perturbed values for a numeric array, any other array is returned as is
+		:param values	array of values to be approximated
+		:return		array of the same size, the perturbed values are shuffled
+		"""
+		# NOTE(review): the comparison only holds for the dtypes numpy considers equal to int/float, confirm the other numeric dtypes are not expected
+		if values.dtype in [int,float] :
+			#
+			# @TODO: create bins?
+			# NOTE(review): negative values would presumably be rejected by dirichlet -- confirm the inputs are never negative
+			r = np.random.dirichlet(values+.001) #-- dirichlet doesn't work on values with zeros
+			# standard deviation and mean of the strictly positive values
+			_sd = values[values > 0].std()
+			_me = values[values > 0].mean()
+			_mi = values.min()	# not used in the rest of the function
+			x = []
+			_type = values.dtype
+			for index in np.arange(values.size) :
+				
+				# the value is randomly increased or decreased by a fraction (r[index]) of itself
+				if np.random.choice([0,1],1)[0] :
+					value = values[index] + (values[index] * r[index])
+					
+				else :
+					value = values[index] - (values[index] * r[index])
+				#
+				# randomly shifting the measurements 
+				# the value is either multiplied by mean/sd or increased by mean/sd, only when the mean is greater than the sd
+				if np.random.choice([0,1],1)[0] and _me > _sd :
+					if np.random.choice([0,1],1)[0] :
+						value = value * np.divide(_me,_sd)
+					else:
+						value = value + (np.divide(_me,_sd))
+				# integers are truncated, anything else is rounded to 2 decimals
+				value = int(value) if _type == int else np.round(value,2)
+				x.append( value)
+			# the perturbed values do not keep the position of the original values
+			np.random.shuffle(x)
+			return np.array(x)
+		else:
+			return values
+		pass
+			
+	def shuffle(self,_args):
+		"""
+		This function approximates the continuous columns of a dataset and posts the result to the target data-store
+		:data		optional data-frame to be processed
+		:file		optional csv file to be processed (used when no data is provided)
+		:store		data-store configuration, store.source is read when neither data nor file is provided, store.target is written to
+		:sql		query submitted to the reader of the source
+		:row_limit	optional limit passed to the reader of the source
+		:from		name of the table used to look up the schema (no schema provided and reader with a meta function)
+		:schema		optional list of {"name":...,"type":...} passed to post
+		:continuous	optional list of columns to be approximated
+		:ignore		optional {"columns":[...]} terms of the names of the columns that are left untouched
+		:columns	optional list of columns to consider, by default all the columns
+		"""
+		#
+		# The body used to refer to an undefined variable (args) instead of the parameter
+		# and the schema was undefined when the data was provided
+		schema = _args['schema'] if 'schema' in _args else None
+		if 'data' in _args :
+			df = _args['data']
+		elif 'file' in _args :
+			df = pd.read_csv(_args['file'])
+		else:
+			reader = factory.instance(**_args['store']['source'])
+			if 'row_limit' in _args and 'sql' in _args:
+				df = reader.read(sql=_args['sql'],limit=_args['row_limit'])
+			else:
+				df = reader.read(sql=_args['sql'])
+			if 'schema' not in _args and hasattr(reader,'meta') :
+				schema = reader.meta(table=_args['from'])
+				schema = [{"name":_item.name,"type":_item.field_type} for _item in schema]
+		#
+		# The continuous columns are approximated, the other columns are posted as they are
+		# NOTE(review): the name of the function suggests the other columns should be shuffled, confirm the intent
+		#
+		x_cols 	= _args['continuous'] if 'continuous' in _args else []	#-- columns to be approximated.
+		_cols 	= []	#-- columns to be ignored
+		if 'ignore' in _args and 'columns' in _args['ignore'] :
+			_cols = self.get_ignore(data=df,columns=_args['ignore']['columns'])
+
+		columns = _args['columns'] if 'columns' in _args else df.columns
+		columns = list(set(columns) - set(_cols))
+		for name in columns:
+			if name in x_cols and df[name].unique().size > 0 :
+				#
+				# The values are approximated in a random order
+				i = np.arange(df.shape[0])
+				np.random.shuffle(i)
+				df[name] = self.approximate(df.iloc[i][name].fillna(0).values)
+
+		df.index = np.arange(df.shape[0])
+		self.post(data=df,schema=schema,store=_args['store']['target'])
+	def post(self,**_args) :
+		"""
+		This function writes a data-frame to a data-store, the table is created if it doesn't exist (stores other than bigquery)
+		:data	data-frame to be written, the columns found in the schema are formatted in place
+		:store	data-store configuration passed to the transport factory, the provider determines how the data is written
+		:from	optional name of the table, by default store.table
+		:schema	optional list of {"name":...,"type":...} used to format the columns and to create the table
+		"""
+		table = _args['from'] if 'from' in _args else _args['store']['table']
+		_schema = _args['schema'] if 'schema' in _args else None
+		_store = _args['store']
+		#
+		# The provider is optional, a store without provider is handled like any store other than bigquery
+		_provider = _store['provider'] if 'provider' in _store else None
+		writer 	= factory.instance(**_store)
+		_df 	= _args['data']
+		if _schema :
+			for _item in _schema :
+				name = _item['name']
+				_type = _item['type']
+				if _type in ['DATE','TIMESTAMP','DATETIME'] :
+					#
+					# There is an issue with missing dates that needs to be resolved.
+					# for some reason a missing date/time here will cause the types to turn into timestamp (problem)
+					#	The following is a hack to address the issue (alas) assuming 10 digit dates and 'NaT' replaces missing date values (pandas specifications)
+					#
+					_size = 10 if _type in ['DATE','DATETIME'] else None	#-- a TIMESTAMP is kept in full
+					_df[name] = _df[name].apply(lambda value: None if str(value) == 'NaT' else str(value)[:_size])
+				elif _type == 'DATETIMESTAMP' :
+					_df[name] = pd.to_datetime(_df[name])
+				else:
+					#
+					# The missing values are replaced by 0 for numbers and by an empty string for anything else
+					_value = 0 if _type in ['INTEGER','FLOAT','NUMERIC'] else ''
+					_df[name] = _df[name].fillna(_value)
+
+		if _provider != 'bigquery' and not writer.has(table=table) :
+			#
+			# The table is created with the name that was resolved above (this used to refer to an undefined variable)
+			_params = {'map':{'STRING':'VARCHAR(256)','INTEGER':'BIGINT'},'table':table}
+			if _schema :
+				_params['schema'] = _schema
+			else:
+				_params['fields'] = _df.columns.tolist()
+			writer.make(**_params)
+
+		if _provider == 'bigquery' :
+			print (['_______ POSTING ______________ ',table])
+			print (['_______________ ',_df.shape[0],' ___________________'])
+			writer.write(_df.astype(object),schema=_schema,table=table)
+		else:
+			writer.table = table
+			writer.write(_df)
+
+		
+
+	def finalize(self,args):
+		"""
+		This function performs post-processing operations on a synthetic table i.e :
+			- the rows of the target (synthetic) table are joined by row number with the unique field read from the source table
+			- the result overwrites the target table, this is done by a native bigquery job (batch priority)
+		:store	configuration of the source, target and logs data-stores, the private_key of the source is used to submit the job
+		:from	name of the table (same name in the source and in the target datasets)
+		:unique_fields	optional name of the unique field, by default <table>_id
+		"""
+		reader = factory.instance(**args['store']['source'])
+		logger = factory.instance(**args['store']['logs'])
+
+		target = args['store']['target']['args']['dataset']
+		source = args['store']['source']['args']['dataset']
+		table = args['from']
+		schema = reader.meta(table=table)
+		#
+		# keys :
+		# NOTE(review): unique_fields is used as a single name, a list would not work here -- confirm what callers provide
+		unique_field = "_".join([table,'id']) if 'unique_fields' not in args else args['unique_fields']
+		#
+		# The unique field is taken from the source (y), every other field is taken from the target (x)
+		fields = [ item.name if item.name != unique_field else "y."+item.name for item in schema]
+		SQL = [
+			"SELECT :fields FROM ",
+			"(SELECT ROW_NUMBER() OVER() AS row_number,* FROM :target.:table) x","INNER JOIN",
+			"(SELECT ROW_NUMBER() OVER() AS row_number, :unique_field FROM :source.:table ORDER BY RAND()) y",
+			"ON y.row_number = x.row_number"
+		]
+		SQL = " ".join(SQL).replace(":fields",",".join(fields)).replace(":table",table).replace(":source",source).replace(":target",target)
+		SQL = SQL.replace(":unique_field",unique_field)
+		#
+		# Use a native job to get this done ...
+		#
+		client      = bq.Client.from_service_account_json(args['store']['source']['args']["private_key"])
+		job = bq.QueryJobConfig()
+		job.destination = client.dataset(target).table(table)
+		job.use_query_cache = True
+		job.allow_large_results = True
+		job.write_disposition = "WRITE_TRUNCATE"
+		job.priority = 'BATCH'
+		r = client.query(SQL,location='US',job_config=job)
+		#
+		# Keep a log of what just happened...
+		# The source is logged as <dataset>.<table> like the destination (the separator used to be missing)
+		# NOTE(review): the key "destimation" is kept as is in case the logs are already queried with it
+		#
+		logger.write({"job":r.job_id,"action":"finalize", "args":{"sql":SQL,"source":".".join([source,table]),"destimation":".".join([target,table])}})
+	def generate(self,args):
+		"""
+		This function will generate data and store it to a given data-store (store.target)
+		:store		configuration of the source, target and logs data-stores
+		:context	name of the document the logs are written to
+		:data		optional data-frame, if it is not provided the data is read from store.source
+		:sql		query submitted to the reader of the source
+		:row_limit	optional limit passed to the reader of the source
+		:from		name of the table, used to look up the schema and to post the data
+		:schema		optional list of {"name":...,"type":...}
+		:continuous	optional list of columns that are approximated instead of being synthesized
+		:ignore		optional {"columns":[...]} terms of the names of the columns that are not synthesized
+		:candidates	optional number of candidates, by default 1
+		:gpu		optional gpu identifier(s) (see set_gpu)
+		The arguments (including the data) are passed to data.maker.generate
+		"""
+		# the logs are written to the document named after the context, the configuration of the logs is updated in place
+		store = args['store']['logs']
+		if 'args' in store :
+			store['args']['doc'] = args['context']
+		else:
+			store['doc'] = args['context']
+		logger = factory.instance(**store) #type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
+
+		ostore = args['store']['target']
+		# the writer is not used in the rest of this function, the data is written by post
+		writer = factory.instance(**ostore)
+
+		schema = args['schema'] if 'schema' in args else None
+		if 'data' in args :
+			
+			df = args['data']
+		else:
+
+			reader = factory.instance(**args['store']['source'])
+			if 'row_limit' in args :
+				df = reader.read(sql=args['sql'],limit=args['row_limit'])
+			else:
+				df = reader.read(sql=args['sql'])
+			if 'schema' not in args and hasattr(reader,'meta'):
+				schema = reader.meta(table=args['from'])
+				schema = [{"name":_item.name,"type":_item.field_type} for _item in schema]
+
+			# else:
+			# 	#
+			# 	# This will account for autopilot mode ...
+			# 	df = args['data']
+		_cast = {}	# not used in the rest of this function
+		if schema :
+			
+			# the columns are cast to the type found in the schema, the missing values are replaced by 0 (numbers) or '' (anything else)
+			# NOTE(review): NUMERIC is cast to an integer here whereas post handles it like FLOAT, decimals would be lost -- confirm
+			for _item in schema :
+				dtype = str
+				name = _item['name']
+				novalue = 0
+				if _item['type'] in ['INTEGER','NUMERIC']:
+					dtype = np.int64
+					
+				elif _item['type'] == 'FLOAT' :
+					dtype = np.float64
+				else:
+					novalue = ''
+				# _cast[schema['name']] = dtype
+				df[name] = df[name].fillna(novalue).astype(dtype)
+
+		_info = {"module":"gan-prep","action":"read","shape":{"rows":df.shape[0],"columns":df.shape[1]},"schema":schema}
+		logger.write(_info)
+		
+				
+		_dc = pd.DataFrame()	# not used in the rest of this function
+		# for mdf in df :
+		args['data'] = df.copy()
+		#
+		# The columns that are continuous should also be skipped because they don't need to be synthesized (like-that)
+		if 'continuous' in args :
+			x_cols = args['continuous']
+		else:
+			x_cols = []
+
+		if 'ignore' in args and 'columns' in args['ignore'] :
+			_cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
+			args['data'] = args['data'][ list(set(df.columns)- set(_cols))]
+		#
+		# We need to remove the continuous columns from the data-frame
+		# @TODO: Abstract this !!
+		#
+		# real_df keeps the original values of the continuous columns, they are approximated further down
+		real_df = pd.DataFrame()
+		if x_cols :
+			args['data'] = args['data'][list(set(args['data'].columns) - set(x_cols))]
+			real_df = df[x_cols].copy()
+		
+		args['candidates']	= 1 if 'candidates' not in args else int(args['candidates'])
+		if 'gpu' in args :
+			args['gpu'] = self.set_gpu(gpu=args['gpu'])
+		# if 'partition' in args :
+		# 	args['logs'] = os.sep.join([args['logs'],str(args['partition'])])
+		
+		_info = {"module":"gan-prep","action":"prune","shape":{"rows":args['data'].shape[0],"columns":args['data'].shape[1]}}
+		logger.write(_info)
+		# the generator is only invoked if there are rows and columns left, otherwise the data that was read is the only candidate
+		if args['data'].shape[0] > 0 and args['data'].shape[1] > 0 :
+			candidates = (data.maker.generate(**args))
+			
+		else:
+			candidates = [df]
+		
+		# if 'sql.BQWriter' in ostore['type'] :
+		_columns = None
+		skip_columns = []
+		_schema = schema
+		if schema :
+			cols = [_item['name'] for _item in _schema]
+		else:
+			cols = df.columns.tolist()
+		_info = {"module":"gan-prep","action":"selection","input":{"candidates":len(candidates),"features":cols}}
+		logger.write(_info)
+		# only the first candidate is processed and posted (see the break at the end of the loop)
+		for _df in candidates :
+			#
+			# we need to format the fields here to make sure we have something cohesive
+			#
+
+			if not skip_columns :
+				if 'ignore' in args and 'columns' in args['ignore'] :
+						skip_columns = self.get_ignore(data=_df,columns=args['ignore']['columns'])
+			#
+			# We perform a series of set operations to insure that the following conditions are met:
+			#	- the synthetic dataset only has fields that need to be synthesized
+			#	- The original dataset has all the fields except those that need to be synthesized
+			#
+			
+			_df = _df[list(set(_df.columns)  - set(skip_columns))].copy()
+			if x_cols :
+				# the continuous columns are approximated from the original values, the statistics of both versions are logged
+				_approx = {}
+				for _col in x_cols :
+					if real_df[_col].unique().size > 0 :
+						
+
+						_df[_col] = self.approximate(real_df[_col].values)
+						# NOTE(review): the "missing" and "zeros" counts of "real" are filtered with conditions evaluated on _df (not real_df) -- confirm this is intended
+						_approx[_col] = {
+							"io":{"min":_df[_col].min().astype(float),"max":_df[_col].max().astype(float),"mean":_df[_col].mean().astype(float),"sd":_df[_col].values.std().astype(float),"missing": _df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":_df[_col].where(_df[_col] == 0).dropna().count().astype(float)},
+							"real":{"min":real_df[_col].min().astype(float),"max":real_df[_col].max().astype(float),"mean":real_df[_col].mean().astype(float),"sd":real_df[_col].values.std().astype(float),"missing": real_df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count().astype(float)}
+						}
+					else:
+						_df[_col] = -1
+				logger.write({"module":"gan-generate","action":"approximate","status":_approx})				
+			# the columns found in the candidate are removed from the original data so the join doesn't have overlapping columns
+			if set(df.columns) & set(_df.columns) :
+				_columns = list(set(df.columns) - set(_df.columns))											
+				df = df[_columns]
+
+			#
+			# Let us merge the dataset here and and have a comprehensive dataset
+
+			_df = pd.DataFrame.join(df,_df)
+			_params = {'data':_df,'store' : ostore,'from':args['from']}
+			if _schema :
+				_params ['schema'] = _schema
+			_info = {"module":"gan-prep","action":"write","input":{"rows":_df.shape[0],"cols":_df.shape[1]}}
+			logger.write(_info)
+			self.post(**_params)
+			# print (['_______ posting _________________',_df.shape])
+			break
+			
+			
+			pass
+		# else:
+		# 	pass
+	def bind(self,**_args):
+		# Placeholder: the arguments are only printed, the binding itself is performed in the main section with the binder module
+		print (_args)
+	
+	
+if __name__ == '__main__' :
+	#
+	# Loading the configuration (by default config.json), the file must have the entries 'pipeline' (list of jobs) and 'store'
+	# The job is selected with the command line argument 'index', the operation with one of generate, bind, shuffle or train
+	#
+	filename = SYS_ARGS['config'] if 'config' in SYS_ARGS else 'config.json'
+	with open (filename) as f :
+		_config = json.loads(f.read())
+	PIPELINE = _config['pipeline']
+	index = SYS_ARGS['index']
+	if index.isnumeric() :
+		index = int(SYS_ARGS['index'])
+	else:
+		#
+		# The index provided is a key to a pipeline entry mainly the context
+		# The first entry of the pipeline is used if the context is not found
+		#
+		N = len(PIPELINE)
+		_found = [i for i in range(0,N) if PIPELINE[i]['context'] == index]
+		index = _found[0] if _found else 0
+	#
+
+	print ("..::: ",PIPELINE[index]['context'],':::..')
+	args =  (PIPELINE[index])
+	for key in _config :
+		if key == 'pipeline' or key in args:
+			#
+			# skip in case of pipeline or if key exists in the selected pipeline (provided by index)
+			#
+			continue
+		args[key] = _config[key]
+	#
+	# The command line arguments take precedence over the configuration
+	args = dict(args,**SYS_ARGS)
+	if 'matrix_size' in args :
+		args['matrix_size'] = int(args['matrix_size'])
+	if 'batch_size' not in args :
+		args['batch_size']	= 2000 #if 'batch_size' not in args else int(args['batch_size'])
+	if 'dataset' not in args :
+		args['dataset'] = 'combined20191004v2_deid'
+	args['logs'] = args['logs'] if 'logs' in args else 'logs'
+	PART_SIZE = int(args['part_size']) if 'part_size' in args else 8
+	#
+	# @TODO:
+	#	Log what was initiated so we have context of this processing ...
+	#
+
+	#
+	# The gpu identifiers are either a list, a comma separated string or a single identifier
+	GPU_CHIPS = args['gpu'] if 'gpu' in args else None
+	if GPU_CHIPS and type(GPU_CHIPS) != list :
+		GPU_CHIPS = [int(_id.strip()) for _id in GPU_CHIPS.split(',')] if type(GPU_CHIPS) == str else [GPU_CHIPS]
+	if 'gpu' in SYS_ARGS :
+		args['gpu'] = GPU_CHIPS
+	jobs = []
+	if 'generate' in SYS_ARGS :
+		#
+		# Let us see if we have partitions given the log folder
+		# The content is not used, the listing fails if the folder of the trained models doesn't exist
+
+		content = os.listdir( os.sep.join([args['logs'],'train',args['context']]))
+		if 'all-chips' in SYS_ARGS and GPU_CHIPS:
+			#
+			# One job per gpu, the identifier of the gpu is used as the partition
+			index = 0
+			jobs = []
+			for _gpu in GPU_CHIPS :
+				_args = copy.deepcopy(args)
+				_args['gpu'] = [int(_gpu)]
+				_args['partition'] = int(_gpu) #index
+				index += 1
+				make = lambda _params: (Components()).generate(_params)
+				job = Process(target=make,args=( dict(_args),))
+				job.name = 'Trainer # ' + str(index)
+				job.start()
+				jobs.append(job)
+		else:
+
+			generator = Components()
+			generator.generate(args)
+	elif 'bind' in SYS_ARGS :
+		import binder
+		_args = _config['_map']
+		_args['store'] = copy.deepcopy(_config['store'])
+		if 'init' in SYS_ARGS :
+			#
+			# Creating and persisting the map ...
+			print (['.... Binding Initialization'])
+			_mapped = binder.Init(**_args)
+
+			_schema = [{"name":_name,"type":"INTEGER"} for _name in _mapped.columns.tolist()]
+			publisher = lambda _params: (Components()).post(**_params)
+			_args = {'data':_mapped,'store':_config['store']['target']}
+			_args['store']['table'] = '_map'
+			if _args['store']['provider'] =='bigquery' :
+				_args['schema'] = _schema
+
+			job = Process (target = publisher,args=(_args,))
+			job.start()
+			jobs = [job]
+		else:
+			#
+			# Applying the map of k on a particular dataset
+			# The index was resolved above, it is reused so a context name is accepted here too
+			#
+			_args['config'] = _config['pipeline'][index]
+			#
+			# The original key can be set in the configuration (both alternatives used to be 'person_id')
+			_args['original_key'] = _config['original_key'] if 'original_key' in _config else 'person_id'
+			table = _config['pipeline'][index]['from']
+			_df =  binder.ApplyOn(**_args)
+			_df = np.array_split(_df,PART_SIZE)
+			jobs = []
+			print (['Publishing ',PART_SIZE,' PARTITION'])
+			#
+			# The partitions are not named data because that would hide the package data (data.maker)
+			for _part in _df :
+				publisher = lambda _params: ( Components() ).post(**_params)
+				_args = {'data':_part,'store':_config['store']['target']}
+				_args['store']['table'] = table
+				print (_args['store'])
+				job = Process(target = publisher,args=(_args,))
+				job.name = "Publisher "+str(len(jobs)+1)
+				job.start()
+				jobs.append(job)
+
+	elif 'shuffle' in SYS_ARGS :
+		index = 0
+		if GPU_CHIPS and 'all-chips' in SYS_ARGS:
+			#
+			# NOTE(review): every job is given the same arguments, confirm the jobs are not writing the same data
+			for index in GPU_CHIPS :
+				publisher = lambda _params: ( Components() ).shuffle(_params)
+				job = Process (target = publisher,args=( args,))
+				job.name = 'Shuffler #' + str(index)
+				job.start()
+				jobs.append(job)
+		else:
+			shuffler = Components()
+			shuffler.shuffle(args)
+	elif 'train' in SYS_ARGS:
+
+		# DATA  = np.array_split(DATA,PART_SIZE)
+		#
+		# Let us create n-jobs across n-gpus, The assumption here is the data that is produced will be a partition
+		# @TODO: Find better name for partition
+		#
+
+		if GPU_CHIPS and 'all-chips' in SYS_ARGS:
+			index = 0
+			print (['... launching ',len(GPU_CHIPS),' jobs',args['context']])
+			for _gpu in GPU_CHIPS :
+				_args = copy.deepcopy(args)
+				_args['gpu'] = [int(_gpu)]
+				_args['partition'] = int(_gpu) #index
+				index += 1
+				make = lambda _params: (Components()).train(**_params)
+				job = Process(target=make,args=( _args,))
+				job.name = 'Trainer # ' + str(index)
+				job.start()
+				jobs.append(job)
+
+		else:
+			#
+			# The choice of the chip will be made internally
+
+			agent = Components()
+			agent.train(**args)
+	#
+	# If we have any jobs we should wait till they finish
+	#
+	DIRTY = 0
+	if (len(jobs)) :
+		print (['.... waiting on ',len(jobs),' jobs'])
+	while len(jobs)> 0 :
+		DIRTY =1
+		jobs = [job for job in jobs if job.is_alive()]
+		time.sleep(2)
+	if DIRTY:
+		print (["..:: jobs finished "])
+	#
+	# We need to harmonize the keys if any at all in this case we do this for shuffle or generate operations
+	# This holds true for bigquery - bigquery only
+	# The flag is only used by the finalization that is disabled below
+	IS_BIGQUERY = _config['store']['source']['provider'] == _config['store']['target']['provider'] and _config['store']['source']['provider'] == 'bigquery'
+
+	# if 'bind' not in SYS_ARGS and IS_BIGQUERY and ('autopilot' in SYS_ARGS or 'finalize' in SYS_ARGS  or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS)) :
+	# 	#
+	# 	# We should pull all the primary keys and regenerate them in order to insure some form of consistency
+	# 	#
+
+	# 	#
+	# 	#
+		
+	# 	print (["..:: Finalizing process"])
+	# 	(Components()).finalize(args)
--- a/setup.py
+++ b/setup.py
@ -4,9 +4,10 @@ import sys

 def read(fname):
    """Return the content of a file located in the folder of this script, the file is closed once it is read"""
    with open(os.path.join(os.path.dirname(__file__), fname)) as f :
       return f.read()
-args = {"name":"data-maker","version":"1.0.5","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
+args = {"name":"data-maker","version":"1.6.4",
+        "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vumc.org","license":"MIT",
        "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
-args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
+args["install_requires"] = ['data-transport@git+https://github.com/lnyemba/data-transport.git','tensorflow']
 args['url'] = 'https://hiplab.mc.vanderbilt.edu/aou/data-maker.git'

 if sys.version_info[0] == 2 :