bug fixes: enhancements
This commit is contained in:
parent cad54d7b45
commit ee0165de01

@@ -0,0 +1,377 @@
#!/usr/bin/env python3
"""
This file performs basic tasks to finalize the GAN process, namely:
    - basic stats & analytics
    - rebuilding the io output into another dataset
"""
import pandas as pd
import numpy as np
from multiprocessing import Process, Lock
from google.oauth2 import service_account
from google.cloud import bigquery as bq
import transport
from data.params import SYS_ARGS
import json

# path = '../curation-prod.json'
# credentials = service_account.Credentials.from_service_account_file(path)
# df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard')
filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config']
f = open(filename)
config = json.loads(f.read())
args = config['pipeline']
f.close()

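#
# Editor's note (not part of this commit): the configuration file is expected to provide at
# least a 'pipeline' entry; the functions below also rely on 'store' (source/target) and
# 'schema' entries. A minimal, purely hypothetical example:
#
#   {
#     "pipeline": [{"context":"observation","from":"observation"}],
#     "store":    {"source":{"provider":"bigquery"},"target":{"provider":"bigquery"}},
#     "schema":   {"original":"prod_dataset","synthetic":"synth_dataset"}
#   }
#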
def _formatSQL(**_args):
    """
    This function will build the _map query for a given segment of ids
    """
    sql = """
        select DISTINCT x.person_id synthetic,y.person_id original
        FROM :synthetic.:table x
        INNER JOIN :original.:table y on x.person_id in (:ids)
        AND x.person_id <> y.person_id AND x.gender_source_value = y.gender_source_value
        AND x.year_of_birth = y.year_of_birth
        ORDER BY 1
    """
    table = _args['table']
    original,synthetic = _args['schema']['original'],_args['schema']['synthetic']
    _ids = np.array(_args['ids']).astype(str)
    return sql.replace(":ids",",".join(_ids)).replace(":synthetic",synthetic).replace(":original",original).replace(":table",table)
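#
# Editor's sketch (not part of this commit): _formatSQL only substitutes the :synthetic,
# :original, :table and :ids placeholders. For hypothetical inputs
#
#   _formatSQL(table='person', schema={'original':'prod','synthetic':'synth'}, ids=[1,2,3])
#
# it returns a query of the form
#
#   select DISTINCT x.person_id synthetic,y.person_id original
#   FROM synth.person x INNER JOIN prod.person y on x.person_id in (1,2,3) ...
#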
def _addCounts(**_args) :
    store = _args['store']
    sql = _args['sql']
    reader = transport.factory.instance(**store['source'])
    _df = reader.read(sql=sql)
    _ids = _df.synthetic.unique()
    _counts = [ np.sum(_df.synthetic == value) for value in _ids]
    original = [_df[_df.synthetic == value].iloc[np.random.choice(np.arange(_counts[_ids.tolist().index(value)]),1),:].original.values[0] for value in _ids]
    _df = pd.DataFrame({"synthetic":_ids,"original":original,"counts":_counts})

    #
    # We can post this to the backend ...
    #
    table = '_map' #-- Yes this is hard-coded
    writer = transport.factory.instance(**dict(store['target'],**{"parallel":True,"table":table}))
    # if writer.has(table=table) is False:
    #     writer.write(_df)
    # else:
    _schema = [{"name":name,"type":"INTEGER"} for name in _df.columns]
    writer.write(_df,schema=_schema)

def Init(**_args) :
    """
    This function will build a map of the synthetic to real individuals.
    The assumption is that the synthesized data is stored in the same data-store as the original; the parameters provided are :
    :param store   object from the configuration file with source,target entries
    :param table   name of the original/synthetic tables (they should be the same)
    :param feat.   features/attributes ... demographics to account for
    """
    store = _args['store']
    reader = transport.factory.instance(**store['source'])
    original,synthetic = _args['schema']['original'],_args['schema']['synthetic']
    table = _args['table']
    sql = _args['sql'].replace(':synthetic',synthetic).replace(':original',original).replace(':table',table)

    _map = reader.read(sql=sql)

    k = _args['k'] if 'k' in _args else 2
    # _iodf = reader.read(table=table)
    # _ids = _iodf['person_id'].unique().tolist()
    # x_ = np.array_split(_ids,1000)
    jobs = []
    # for _items in x_ :
    #     _p = {"ids":_items,"schema":_args['schema'],'store':store,'table':table}
    #     sql = _formatSQL(**_p)
    #     _p['sql'] = sql
    #     _apply = lambda params: _addCounts(**params)
    #     thread = Process(target=_apply,args=(_p,))
    #     thread.start()
    #     jobs.append(thread)

    # return jobs
    #
    # We have performed a m:m (many-to-many) relationship with original participants and synthetic participants
    # The goal is to obtain a singular map against which records will be migrated
    #
    print (['... computing counts (k)'])
    _ids = _map.synthetic.unique()
    _counts = [ np.sum(_map.synthetic == value) for value in _ids]
    original = [_map[_map.synthetic == value].iloc[np.random.choice(np.arange(_counts[_ids.tolist().index(value)]),1),:].original.values[0] for value in _ids]
    print (['Building k-classes/groups'])
    _mdf = pd.DataFrame({"synthetic":_ids,"original":original,"counts":_counts})
    i = _mdf.apply(lambda row: row.counts >= k,axis=1)
    _mdf = _mdf[i]
    #
    # Log what just happened here so we know about the equivalence classes,
    # {"module":"binder","action":"map-generation","input":{"k":k,"rows":{"synthetic":_mdf.shape[0],"original":len(_counts)}}}

    return _mdf

#
# now we are posting this to target storage ...
#
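#
# Editor's sketch (not part of this commit): the counts/k filter in Init keeps only synthetic
# ids that matched at least k originals, then pairs each surviving id with one of its
# originals picked at random. On a toy map, assuming k = 2:
#
#   _map = pd.DataFrame({'synthetic':[10,10,11],'original':[1,2,3]})
#   counts -> {10: 2, 11: 1}, so only synthetic id 10 survives, paired with 1 or 2.
#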
def ApplyOn (**_args):
    """
    This function will rewrite SQL that applies the synthetic identifier to the entries of the pipeline
    We assume that the _map has two attributes (synthetic and original)
    :param store
    :param _config
    """
    store_args = _args['store']
    _config = _args['config']

    table = _config['from']
    reader = transport.factory.instance(**dict(store_args['source'],**{"table":table}))
    attr = reader.read(limit=1).columns.tolist()
    original_key = _args['original_key'] #-- assuming referential integrity

    # synthetic_key = columns['synthetic']
    # mapped_original = columns['original']
    fields = list(set(attr) - set([original_key]))
    sql = "select _map.synthetic as :original_key,:fields from :original_schema.:table inner join :synthetic_schema._map on _map.original = :table.:original_key"
    sql = sql.replace(":table",table).replace(":fields",",".join(fields))
    sql = sql.replace(":original_key",original_key)
    _schema = _args['schema']
    sql = sql.replace(":original_schema",_schema['original']).replace(":synthetic_schema",_schema['synthetic'])

    return reader.read (sql=sql)

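#
# Editor's sketch (not part of this commit): with a hypothetical table 'measurement',
# original_key 'person_id', schema {'original':'prod','synthetic':'synth'} and remaining
# fields ['value','unit'], ApplyOn issues roughly:
#
#   select _map.synthetic as person_id,value,unit
#   from prod.measurement inner join synth._map on _map.original = measurement.person_id
#
# i.e. each original person_id is swapped for the synthetic id recorded in the _map table.
#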
if __name__ == '__main__' :
    pass

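#
# Editor's usage sketch (not part of this commit). The entry point above is a deliberate no-op
# because pipeline.py drives these functions (see the 'bind' branch added there). Assuming a
# loaded `config` shaped as sketched earlier, a standalone run could look roughly like:
#
#   _args = dict(config['_map'], store=config['store'])
#   _mdf  = Init(**_args)                      # build & prune the synthetic -> original map
#   _df   = ApplyOn(store=config['store'], config=config['pipeline'][0],
#                   schema=_args['schema'], original_key='person_id')
#
# 'person_id' and the exact config keys are assumptions made for illustration only.
#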
# class Analytics :
|
||||||
|
# """
|
||||||
|
# This class will compile basic analytics about a given dataset i.e compare original/synthetic
|
||||||
|
# """
|
||||||
|
# @staticmethod
|
||||||
|
# def distribution(**args):
|
||||||
|
# context = args['context']
|
||||||
|
# df = args['data']
|
||||||
|
# #
|
||||||
|
# #-- This data frame counts unique values for each feature (space)
|
||||||
|
# df_counts = pd.DataFrame(df.apply(lambda col: col.unique().size),columns=['counts']).T # unique counts
|
||||||
|
# #
|
||||||
|
# #-- Get the distributions for common values
|
||||||
|
# #
|
||||||
|
# names = [name for name in df_counts.columns.tolist() if name.endswith('_io') == False]
|
||||||
|
# ddf = df.apply(lambda col: pd.DataFrame(col.values,columns=[col.name]).groupby([col.name]).size() ).fillna(0)
|
||||||
|
# ddf[context] = ddf.index
|
||||||
|
|
||||||
|
# pass
|
||||||
|
# def distance(**args):
|
||||||
|
# """
|
||||||
|
# This function will measure the distance between
|
||||||
|
# """
|
||||||
|
# pass
|
||||||
|
# class Utils :
|
||||||
|
# @staticmethod
|
||||||
|
# def log(**args):
|
||||||
|
# logger = transport.factory.instance(type="mongo.MongoWriter",args={"dbname":"aou","doc":"logs"})
|
||||||
|
# logger.write(args)
|
||||||
|
# logger.close()
|
||||||
|
# class get :
|
||||||
|
# @staticmethod
|
||||||
|
# def pipeline(table,path) :
|
||||||
|
# # contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts']
|
||||||
|
# config = json.loads((open(path)).read())
|
||||||
|
# pipeline = config['pipeline']
|
||||||
|
# # return [ item for item in pipeline if item['context'] in contexts]
|
||||||
|
# pipeline = [item for item in pipeline if 'from' in item and item['from'].strip() == table]
|
||||||
|
# Utils.log(module=table,action='init',input={"pipeline":pipeline})
|
||||||
|
# return pipeline
|
||||||
|
# @staticmethod
|
||||||
|
# def sql(**args) :
|
||||||
|
# """
|
||||||
|
# This function is intended to build SQL query for the remainder of the table that was not synthesized
|
||||||
|
# :config configuration entries
|
||||||
|
# :from source of the table name
|
||||||
|
# :dataset name of the source dataset
|
||||||
|
|
||||||
|
# """
|
||||||
|
# SQL = ["SELECT * FROM :from "]
|
||||||
|
# SQL_FILTER = []
|
||||||
|
# NO_FILTERS_FOUND = True
|
||||||
|
# # pipeline = Utils.get.config(**args)
|
||||||
|
# pipeline = args['pipeline']
|
||||||
|
# REVERSE_QUALIFIER = {'IN':'NOT IN','NOT IN':'IN','=':'<>','<>':'='}
|
||||||
|
# for item in pipeline :
|
||||||
|
|
||||||
|
|
||||||
|
# if 'filter' in item :
|
||||||
|
# if NO_FILTERS_FOUND :
|
||||||
|
# NO_FILTERS_FOUND = False
|
||||||
|
# SQL += ['WHERE']
|
||||||
|
# #
|
||||||
|
# # Let us load the filter in the SQL Query
|
||||||
|
# FILTER = item['filter']
|
||||||
|
# QUALIFIER = REVERSE_QUALIFIER[FILTER['qualifier'].upper()]
|
||||||
|
# SQL_FILTER += [" ".join([FILTER['field'], QUALIFIER,'(',FILTER['value'],')']).replace(":dataset",args['dataset'])]
|
||||||
|
# src = ".".join([args['dataset'],args['from']])
|
||||||
|
# SQL += [" AND ".join(SQL_FILTER)]
|
||||||
|
# #
|
||||||
|
# # let's pull the field schemas out of the table definition
|
||||||
|
# #
|
||||||
|
# Utils.log(module=args['from'],action='sql',input={"sql":" ".join(SQL) })
|
||||||
|
# return " ".join(SQL).replace(":from",src)
|
||||||
|
|
||||||
|
|
||||||
|
# def mk(**args) :
|
||||||
|
# dataset = args['dataset']
|
||||||
|
# client = args['client'] if 'client' in args else bq.Client.from_service_account_file(args['private_key'])
|
||||||
|
# #
|
||||||
|
# # let us see if we have a dataset handy here
|
||||||
|
# #
|
||||||
|
# datasets = list(client.list_datasets())
|
||||||
|
# found = [item for item in datasets if item.dataset_id == dataset]
|
||||||
|
|
||||||
|
# if not found :
|
||||||
|
|
||||||
|
# return client.create_dataset(dataset)
|
||||||
|
# return found[0]
|
||||||
|
|
||||||
|
# def move (args):
|
||||||
|
# """
|
||||||
|
# This function will move a table from the synthetic dataset into a designated location
|
||||||
|
# This is the simplest case for finalizing a synthetic data set
|
||||||
|
# :private_key
|
||||||
|
# """
|
||||||
|
# pipeline = Utils.get.pipeline(args['from'],args['config'])
|
||||||
|
# _args = json.loads((open(args['config'])).read())
|
||||||
|
# _args['pipeline'] = pipeline
|
||||||
|
# # del _args['pipeline']
|
||||||
|
# args = dict(args,**_args)
|
||||||
|
# # del args['pipeline']
|
||||||
|
# # private_key = args['private_key']
|
||||||
|
# client = bq.Client.from_service_account_json(args['private_key'])
|
||||||
|
|
||||||
|
# dataset = args['dataset']
|
||||||
|
# if pipeline :
|
||||||
|
# SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in pipeline]
|
||||||
|
# SQL += [Utils.get.sql(**args)]
|
||||||
|
# SQL = ('\n UNION ALL \n'.join(SQL).replace(':dataset','io'))
|
||||||
|
# else:
|
||||||
|
# #
|
||||||
|
# # moving a table to a designated location
|
||||||
|
# tablename = args['from']
|
||||||
|
# if 'sql' not in args :
|
||||||
|
# SQL = "SELECT * FROM :dataset.:table"
|
||||||
|
# else:
|
||||||
|
# SQL = args['sql']
|
||||||
|
# SQL = SQL.replace(":dataset",dataset).replace(":table",tablename)
|
||||||
|
# Utils.log(module=args['from'],action='sql',input={'sql':SQL})
|
||||||
|
# #
|
||||||
|
# # At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table
|
||||||
|
# #
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# odataset = mk(dataset=dataset+'_io',client=client)
|
||||||
|
# # SQL = "SELECT * FROM io.:context_full_io".replace(':context',context)
|
||||||
|
# config = bq.QueryJobConfig()
|
||||||
|
# config.destination = client.dataset(odataset.dataset_id).table(args['from'])
|
||||||
|
# config.use_query_cache = True
|
||||||
|
# config.allow_large_results = True
|
||||||
|
# config.priority = 'INTERACTIVE'
|
||||||
|
# #
|
||||||
|
# #
|
||||||
|
|
||||||
|
# schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema
|
||||||
|
# fields = [" ".join(["CAST (",item.name,"AS",item.field_type.replace("INTEGER","INT64").replace("FLOAT","FLOAT64"),") ",item.name]) for item in schema]
|
||||||
|
# SQL = SQL.replace("*"," , ".join(fields))
|
||||||
|
# # print (SQL)
|
||||||
|
# out = client.query(SQL,location='US',job_config=config)
|
||||||
|
# Utils.log(module=args['from'],action='move',input={'job':out.job_id})
|
||||||
|
# return (out.job_id)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# import pandas as pd
|
||||||
|
# import numpy as np
|
||||||
|
# from google.oauth2 import service_account
|
||||||
|
# import json
|
||||||
|
|
||||||
|
# # path = '../curation-prod.json'
|
||||||
|
# # credentials = service_account.Credentials.from_service_account_file(path)
|
||||||
|
# # df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard')
|
||||||
|
# filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config']
|
||||||
|
# f = open(filename)
|
||||||
|
# config = json.loads(f.read())
|
||||||
|
# args = config['pipeline']
|
||||||
|
# f.close()
|
||||||
|
|
||||||
|
|
||||||
|
# if __name__ == '__main__' :
|
||||||
|
# """
|
||||||
|
# Usage :
|
||||||
|
# finalize --<move|stats> --contexts <c1,c2,...c3> --from <table>
|
||||||
|
# """
|
||||||
|
|
||||||
|
# if 'move' in SYS_ARGS :
|
||||||
|
|
||||||
|
# if 'init' in SYS_ARGS :
|
||||||
|
# dep = config['dep'] if 'dep' in config else {}
|
||||||
|
# info = []
|
||||||
|
|
||||||
|
# if 'queries' in dep :
|
||||||
|
# info += dep['queries']
|
||||||
|
# print ('________')
|
||||||
|
# if 'tables' in dep :
|
||||||
|
# info += dep['tables']
|
||||||
|
# args = {}
|
||||||
|
# jobs = []
|
||||||
|
# for item in info :
|
||||||
|
# args = {}
|
||||||
|
# if type(item) == str :
|
||||||
|
# args['from'] = item
|
||||||
|
# name = item
|
||||||
|
# else:
|
||||||
|
# args = item
|
||||||
|
# name = item['from']
|
||||||
|
# args['config'] = SYS_ARGS['config']
|
||||||
|
# # args['pipeline'] = []
|
||||||
|
# job = Process(target=move,args=(args,))
|
||||||
|
# job.name = name
|
||||||
|
# jobs.append(job)
|
||||||
|
# job.start()
|
||||||
|
|
||||||
|
|
||||||
|
# # while len(jobs) > 0 :
|
||||||
|
# # jobs = [job for job in jobs if job.is_alive()]
|
||||||
|
# # time.sleep(1)
|
||||||
|
|
||||||
|
|
||||||
|
# else:
|
||||||
|
# move(SYS_ARGS)
|
||||||
|
# # # table = SYS_ARGS['from']
|
||||||
|
# # # args = dict(config,**{"private_key":"../curation-prod.json"})
|
||||||
|
# # args = dict(args,**SYS_ARGS)
|
||||||
|
# # contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']]
|
||||||
|
# # log = []
|
||||||
|
# # if contexts :
|
||||||
|
# # args['contexts'] = contexts
|
||||||
|
# # log = move(**args)
|
||||||
|
|
||||||
|
# # else:
|
||||||
|
# # tables = args['from'].split(',')
|
||||||
|
# # for name in tables :
|
||||||
|
# # name = name.strip()
|
||||||
|
# # args['from'] = name
|
||||||
|
# # log += [move(**args)]
|
||||||
|
# # print ("\n".join(log))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# else:
|
||||||
|
# print ("NOT YET READY !")
|
54 data/gan.py
@@ -622,7 +622,7 @@ class Predict(GNet):
         candidates.append(np.array([np.round(row).astype(int) for row in _matrix]))
         # return candidates[0] if len(candidates) == 1 else candidates

-        return candidates
+        return [candidates [0]]

     def _apply(self,**args):
         # print (self.train_dir)
@@ -768,55 +768,3 @@ class Predict(GNet):
         # return df.to_dict(orient='list')
         return _matrix
-
-
-if __name__ == '__main__' :
-    #
-    # Now we get things done ...
-    column = SYS_ARGS['column']
-    column_id = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id'
-    column_id = column_id.split(',') if ',' in column_id else column_id
-    df = pd.read_csv(SYS_ARGS['raw-data'])
-    LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values
-
-    context = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4]
-    if set(['train','learn']) & set(SYS_ARGS.keys()):
-
-        df = pd.read_csv(SYS_ARGS['raw-data'])
-
-        # cols = SYS_ARGS['column']
-        # _map,_df = (Binary()).Export(df)
-        # i = np.arange(_map[column]['start'],_map[column]['end'])
-        max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10
-        # REAL = _df[:,i]
-        REAL = pd.get_dummies(df[column]).astype(np.float32).values
-        LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values
-        trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id)
-        trainer.apply()
-
-
-
-
-        #
-        # We should train upon this data
-        #
-        # -- we need to convert the data-frame to binary matrix, given a column
-        #
-        pass
-    elif 'generate' in SYS_ARGS:
-        values = df[column].unique().tolist()
-        values.sort()
-
-        p = Predict(context=context,label=LABEL,values=values,column=column)
-        p.load_meta(column)
-        r = p.apply()
-        # print (df)
-        # print ()
-        df[column] = r[column]
-        # print (df)
-
-
-    else:
-        print (SYS_ARGS.keys())
-        print (__doc__)
-        pass
@@ -96,7 +96,11 @@ def train (**_args):
     # This

     args['store'] = copy.deepcopy(_args['store']['logs'])
-    args['store']['args']['doc'] = _args['context']
+    if 'args' in _args['store']:
+        args['store']['args']['doc'] = _args['context']
+    else:
+
+        args['store']['doc'] = _args['context']
     logger = factory.instance(**args['store'])
     args['logger'] = logger

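The added guard lets train() accept either shape of logging configuration; a sketch of the two
layouts it now handles (the concrete values are illustrative, not taken from this commit):

    # nested layout, e.g. a mongo-style writer
    store = {'type':'mongo.MongoWriter','args':{'dbname':'aou','doc':'observation'}}
    # flat layout, where the document/collection name sits at the top level
    store = {'provider':'mongo','doc':'observation'}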
@ -39,26 +39,10 @@ class Input :
|
||||||
- provide a feature space, and rows (matrix profile)
|
- provide a feature space, and rows (matrix profile)
|
||||||
- a data index map
|
- a data index map
|
||||||
"""
|
"""
|
||||||
# def learn(self,**_args):
|
|
||||||
# """
|
|
||||||
# This function is designed to learn about, the data and persist
|
|
||||||
# :param table
|
|
||||||
# :param store
|
|
||||||
# """
|
|
||||||
# table = _args['table']
|
|
||||||
# reader = transport.factory.instance(**_args['store'])
|
|
||||||
# df = reader.read(table=table,limit=1)
|
|
||||||
# self.columns = df.columns.tolist()
|
|
||||||
|
|
||||||
# self._metadf = pd.DataFrame(self.df[self._columns].dtypes.values.astype(str)).T #,self._columns]
|
|
||||||
# self._metadf.columns = self._columns
|
|
||||||
|
|
||||||
# sql = "SELECT :fields from :table".replace(":table",table)
|
|
||||||
|
|
||||||
|
|
||||||
def __init__(self,**_args):
|
def __init__(self,**_args):
|
||||||
"""
|
"""
|
||||||
:param table
|
:param data
|
||||||
:param store data-store parameters/configuration
|
:param store data-store parameters/configuration
|
||||||
:param sql sql query that pulls a representative sample of the data
|
:param sql sql query that pulls a representative sample of the data
|
||||||
"""
|
"""
|
||||||
|
@ -70,29 +54,18 @@ class Input :
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
self._initsql(**_args)
|
self._initsql(**_args)
|
||||||
|
#
|
||||||
|
# We need to have a means to map of values,columns and vector positions in order
|
||||||
|
# to perform convert and revert to and from binary
|
||||||
|
#
|
||||||
self._map = {} if 'map' not in _args else _args['map']
|
self._map = {} if 'map' not in _args else _args['map']
|
||||||
# self._metadf = pd.DataFrame(self.df[self._columns].dtypes.values.astype(str)).T #,self._columns]
|
|
||||||
# self._metadf.columns = self._columns
|
|
||||||
# if 'gpu' in _args and 'GPU' in os.environ:
|
|
||||||
|
|
||||||
# np = cp
|
|
||||||
# index = int(_args['gpu'])
|
|
||||||
# np.cuda.Device(index).use()
|
|
||||||
# print(['..:: GPU ',index])
|
|
||||||
|
|
||||||
def _initsql(self,**_args):
|
def _initsql(self,**_args):
|
||||||
"""
|
"""
|
||||||
This function will initialize the class on the basis of a data-store and optionally pre-defined columns to be used to be synthesized
|
This function will initialize the class on the basis of a data-store and optionally pre-defined columns to be used to be synthesized
|
||||||
:param store data-store configuration
|
:param store data-store configuration
|
||||||
:param sql sql query to be applied to the transported data
|
|
||||||
:param columns list of columns to be
|
:param columns list of columns to be
|
||||||
"""
|
"""
|
||||||
# _store_args = _args['store']
|
|
||||||
# reader = transport.factory.instance(**_store_args)
|
|
||||||
# sql = _args['sql']
|
|
||||||
|
|
||||||
# self.df = reader.read(sql=_args['sql'])
|
|
||||||
|
|
||||||
|
|
||||||
if 'columns' not in _args :
|
if 'columns' not in _args :
|
||||||
self._initcols(data=self.df)
|
self._initcols(data=self.df)
|
||||||
|
@ -128,14 +101,6 @@ class Input :
|
||||||
:param data data-frame that holds the data
|
:param data data-frame that holds the data
|
||||||
:param columns columns that need to be synthesized if any
|
:param columns columns that need to be synthesized if any
|
||||||
"""
|
"""
|
||||||
#
|
|
||||||
# setting class-level variables to be reused across the class
|
|
||||||
# self.df = _args['data']
|
|
||||||
row_count = self.df.shape[0]
|
|
||||||
# self.columns = self.df.columns
|
|
||||||
# self._metadf = self.df.apply(lambda col: col.unique().size)
|
|
||||||
# _df = pd.DataFrame(self.df.apply(lambda col: col.unique().size )).T
|
|
||||||
# cols = None if 'columns' not in _args else _args['columns']
|
|
||||||
self._initcols(**_args)
|
self._initcols(**_args)
|
||||||
|
|
||||||
def convert(self,**_args):
|
def convert(self,**_args):
|
||||||
|
@ -247,16 +212,3 @@ class Input :
|
||||||
|
|
||||||
return cols,_matrix
|
return cols,_matrix
|
||||||
|
|
||||||
if __name__ == '__main__' :
|
|
||||||
df = pd.read_csv('../../sample.csv')
|
|
||||||
_input = Input(data=df,columns=['age','race'])
|
|
||||||
_m = _input.convert(column='age')
|
|
||||||
print (_m.shape)
|
|
||||||
print (_input.revert(matrix=_m,column='age'))
|
|
||||||
print (_input._metadf)
|
|
||||||
|
|
||||||
# _args = {"store":{"type":"sql.BQReader","args":{"service_key":"/home/steve/dev/aou/accounts/curation-prod.json"}}}
|
|
||||||
# _args['table'] = 'io.observation'
|
|
||||||
# _i = Input(**_args)
|
|
||||||
# df = pd.read_csv('../../sample.csv')
|
|
||||||
# print (Input.ToBinary(df.age))
|
|
330 pipeline.py
@ -101,11 +101,14 @@ class Components :
|
||||||
df = pd.read_csv(args['file'])
|
df = pd.read_csv(args['file'])
|
||||||
del args['file']
|
del args['file']
|
||||||
elif 'data' not in args :
|
elif 'data' not in args :
|
||||||
|
|
||||||
reader = factory.instance(**args['store']['source'])
|
reader = factory.instance(**args['store']['source'])
|
||||||
|
|
||||||
|
|
||||||
if 'row_limit' in args :
|
if 'row_limit' in args :
|
||||||
df = reader.read(sql=args['sql'],limit=args['row_limit'])
|
df = reader.read(sql=args['sql'],limit=args['row_limit'])
|
||||||
else:
|
else:
|
||||||
df = reader.read(sql=args['sql'])
|
df = reader.read(sql=args['sql'])
|
||||||
schema = reader.meta(table=args['from']) if hasattr(reader,'meta') and 'from' in args else None
|
schema = reader.meta(table=args['from']) if hasattr(reader,'meta') and 'from' in args else None
|
||||||
else:
|
else:
|
||||||
df = args['data']
|
df = args['data']
|
||||||
|
@ -241,6 +244,7 @@ class Components :
|
||||||
df.index = np.arange(df.shape[0])
|
df.index = np.arange(df.shape[0])
|
||||||
self.post(data=df,schema=schema,store=args['store']['target'])
|
self.post(data=df,schema=schema,store=args['store']['target'])
|
||||||
def post(self,**_args) :
|
def post(self,**_args) :
|
||||||
|
table = _args['from'] if 'from' in _args else _args['store']['table']
|
||||||
_schema = _args['schema'] if 'schema' in _args else None
|
_schema = _args['schema'] if 'schema' in _args else None
|
||||||
writer = factory.instance(**_args['store'])
|
writer = factory.instance(**_args['store'])
|
||||||
_df = _args['data']
|
_df = _args['data']
|
||||||
|
@ -251,13 +255,13 @@ class Components :
|
||||||
_type = str
|
_type = str
|
||||||
_value = 0
|
_value = 0
|
||||||
if _item['type'] in ['DATE','TIMESTAMP','DATETIMESTAMP','DATETIME'] :
|
if _item['type'] in ['DATE','TIMESTAMP','DATETIMESTAMP','DATETIME'] :
|
||||||
if _item['type'] == 'DATE' :
|
if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
|
||||||
#
|
#
|
||||||
# There is an issue with missing dates that needs to be resolved.
|
# There is an issue with missing dates that needs to be resolved.
|
||||||
# for some reason a missing date/time here will cause the types to turn into timestamp (problem)
|
# for some reason a missing date/time here will cause the types to turn into timestamp (problem)
|
||||||
# The following is a hack to address the issue (alas) assuming 10 digit dates and 'NaT' replaces missing date values (pandas specifications)
|
# The following is a hack to address the issue (alas) assuming 10 digit dates and 'NaT' replaces missing date values (pandas specifications)
|
||||||
#
|
#
|
||||||
_df[name] = _df[name].apply(lambda value: '' if str(value) == 'NaT' else str(value)[:10])
|
_df[name] = _df[name].apply(lambda value: None if str(value) == 'NaT' else (str(value)[:10]) if _item['type'] in ['DATE','DATETIME'] else str(value))
|
||||||
#_df[name] = _df[name].dt.date
|
#_df[name] = _df[name].dt.date
|
||||||
# _df[name] = pd.to_datetime(_df[name].fillna(''),errors='coerce')
|
# _df[name] = pd.to_datetime(_df[name].fillna(''),errors='coerce')
|
||||||
else:
|
else:
|
||||||
|
@ -274,11 +278,33 @@ class Components :
|
||||||
_value = ''
|
_value = ''
|
||||||
_df[name] = _df[name].fillna(_value) #.astype(_type)
|
_df[name] = _df[name].fillna(_value) #.astype(_type)
|
||||||
columns.append(name)
|
columns.append(name)
|
||||||
print ()
|
|
||||||
print (_df)
|
fields = _df.columns.tolist()
|
||||||
writer.write(_df.astype(object),schema=_schema,table=args['from'])
|
if not writer.has(table=table) and _args['store']['provider'] != 'bigquery':
|
||||||
|
|
||||||
|
_map = {'STRING':'VARCHAR(256)','INTEGER':'BIGINT'} if 'provider' in _args['store'] and _args['store']['provider'] != 'bigquery' else {}
|
||||||
|
_params = {'map':_map,'table':args['from']}
|
||||||
|
if _schema :
|
||||||
|
_params['schema'] = _schema
|
||||||
|
|
||||||
|
else:
|
||||||
|
_params['fields'] = fields
|
||||||
|
|
||||||
|
writer.make(**_params)
|
||||||
|
|
||||||
|
fields = _df.columns.tolist()
|
||||||
|
_df = _df[fields]
|
||||||
|
# writer.fields = fields
|
||||||
|
if _args['store']['provider'] == 'bigquery' :
|
||||||
|
print (['_______ POSTING ______________ ',table])
|
||||||
|
print (['_______________ ',_df.shape[0],' ___________________'])
|
||||||
|
writer.write(_df.astype(object),schema=_schema,table=table)
|
||||||
else:
|
else:
|
||||||
writer.write(_df,table=args['from'])
|
writer.table = table
|
||||||
|
writer.write(_df)
|
||||||
|
# else:
|
||||||
|
# writer.write(_df,table=args['from'])
|
||||||
|
|
||||||
|
|
||||||
def finalize(self,args):
|
def finalize(self,args):
|
||||||
"""
|
"""
|
||||||
|
@ -288,8 +314,9 @@ class Components :
|
||||||
"""
|
"""
|
||||||
reader = factory.instance(**args['store']['source'])
|
reader = factory.instance(**args['store']['source'])
|
||||||
logger = factory.instance(**args['store']['logs'])
|
logger = factory.instance(**args['store']['logs'])
|
||||||
target = args['store']['target']['args']['dataset']
|
|
||||||
source = args['store']['source']['args']['dataset']
|
target = args['store']['target']['args']['dataset']
|
||||||
|
source = args['store']['source']['args']['dataset']
|
||||||
table = args['from']
|
table = args['from']
|
||||||
schema = reader.meta(table=args['from'])
|
schema = reader.meta(table=args['from'])
|
||||||
#
|
#
|
||||||
|
@ -327,7 +354,10 @@ class Components :
|
||||||
This function will generate data and store it to a given,
|
This function will generate data and store it to a given,
|
||||||
"""
|
"""
|
||||||
store = args['store']['logs']
|
store = args['store']['logs']
|
||||||
store['args']['doc'] = args['context']
|
if 'args' in store :
|
||||||
|
store['args']['doc'] = args['context']
|
||||||
|
else:
|
||||||
|
store['doc'] = args['context']
|
||||||
logger = factory.instance(**store) #type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
|
logger = factory.instance(**store) #type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
|
||||||
|
|
||||||
ostore = args['store']['target']
|
ostore = args['store']['target']
|
||||||
|
@ -348,13 +378,13 @@ class Components :
|
||||||
schema = reader.meta(table=args['from'])
|
schema = reader.meta(table=args['from'])
|
||||||
schema = [{"name":_item.name,"type":_item.field_type} for _item in schema]
|
schema = [{"name":_item.name,"type":_item.field_type} for _item in schema]
|
||||||
|
|
||||||
|
|
||||||
# else:
|
# else:
|
||||||
# #
|
# #
|
||||||
# # This will account for autopilot mode ...
|
# # This will account for autopilot mode ...
|
||||||
# df = args['data']
|
# df = args['data']
|
||||||
_cast = {}
|
_cast = {}
|
||||||
if schema :
|
if schema :
|
||||||
|
|
||||||
for _item in schema :
|
for _item in schema :
|
||||||
dtype = str
|
dtype = str
|
||||||
name = _item['name']
|
name = _item['name']
|
||||||
|
@ -405,139 +435,72 @@ class Components :
|
||||||
logger.write(_info)
|
logger.write(_info)
|
||||||
if args['data'].shape[0] > 0 and args['data'].shape[1] > 0 :
|
if args['data'].shape[0] > 0 and args['data'].shape[1] > 0 :
|
||||||
candidates = (data.maker.generate(**args))
|
candidates = (data.maker.generate(**args))
|
||||||
|
|
||||||
else:
|
else:
|
||||||
candidates = [df]
|
candidates = [df]
|
||||||
if 'sql.BQWriter' in ostore['type'] :
|
|
||||||
#table = ".".join([ostore['['dataset'],args['context']])
|
# if 'sql.BQWriter' in ostore['type'] :
|
||||||
# writer = factory.instance(**ostore)
|
_columns = None
|
||||||
_columns = None
|
skip_columns = []
|
||||||
skip_columns = []
|
_schema = schema
|
||||||
_schema = schema
|
if schema :
|
||||||
if schema :
|
cols = [_item['name'] for _item in _schema]
|
||||||
cols = [_item['name'] for _item in _schema]
|
else:
|
||||||
else:
|
cols = df.columns.tolist()
|
||||||
cols = df.columns
|
_info = {"module":"gan-prep","action":"selection","input":{"candidates":len(candidates),"features":cols}}
|
||||||
for _df in candidates :
|
logger.write(_info)
|
||||||
#
|
for _df in candidates :
|
||||||
# we need to format the fields here to make sure we have something cohesive
|
#
|
||||||
#
|
# we need to format the fields here to make sure we have something cohesive
|
||||||
|
#
|
||||||
|
|
||||||
if not skip_columns :
|
if not skip_columns :
|
||||||
# _columns = set(df.columns) - set(_df.columns)
|
if 'ignore' in args and 'columns' in args['ignore'] :
|
||||||
if 'ignore' in args and 'columns' in args['ignore'] :
|
skip_columns = self.get_ignore(data=_df,columns=args['ignore']['columns'])
|
||||||
skip_columns = self.get_ignore(data=_df,columns=args['ignore']['columns'])
|
#
|
||||||
# for name in args['ignore']['columns'] :
|
# We perform a series of set operations to insure that the following conditions are met:
|
||||||
# for _name in _df.columns:
|
# - the synthetic dataset only has fields that need to be synthesized
|
||||||
# if _name in name:
|
# - The original dataset has all the fields except those that need to be synthesized
|
||||||
# skip_columns.append(_name)
|
#
|
||||||
#
|
|
||||||
# We perform a series of set operations to insure that the following conditions are met:
|
_df = _df[list(set(_df.columns) - set(skip_columns))].copy()
|
||||||
# - the synthetic dataset only has fields that need to be synthesized
|
if x_cols :
|
||||||
# - The original dataset has all the fields except those that need to be synthesized
|
_approx = {}
|
||||||
#
|
for _col in x_cols :
|
||||||
|
if real_df[_col].unique().size > 0 :
|
||||||
_df = _df[list(set(_df.columns) - set(skip_columns))].copy()
|
|
||||||
if x_cols :
|
|
||||||
_approx = {}
|
|
||||||
for _col in x_cols :
|
|
||||||
if real_df[_col].unique().size > 0 :
|
|
||||||
|
|
||||||
|
|
||||||
_df[_col] = self.approximate(real_df[_col].values)
|
_df[_col] = self.approximate(real_df[_col].values)
|
||||||
_approx[_col] = {
|
_approx[_col] = {
|
||||||
"io":{"min":_df[_col].min().astype(float),"max":_df[_col].max().astype(float),"mean":_df[_col].mean().astype(float),"sd":_df[_col].values.std().astype(float),"missing": _df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":_df[_col].where(_df[_col] == 0).dropna().count().astype(float)},
|
"io":{"min":_df[_col].min().astype(float),"max":_df[_col].max().astype(float),"mean":_df[_col].mean().astype(float),"sd":_df[_col].values.std().astype(float),"missing": _df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":_df[_col].where(_df[_col] == 0).dropna().count().astype(float)},
|
||||||
"real":{"min":real_df[_col].min().astype(float),"max":real_df[_col].max().astype(float),"mean":real_df[_col].mean().astype(float),"sd":real_df[_col].values.std().astype(float),"missing": real_df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count().astype(float)}
|
"real":{"min":real_df[_col].min().astype(float),"max":real_df[_col].max().astype(float),"mean":real_df[_col].mean().astype(float),"sd":real_df[_col].values.std().astype(float),"missing": real_df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count().astype(float)}
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
_df[_col] = -1
|
_df[_col] = -1
|
||||||
logger.write({"module":"gan-generate","action":"approximate","status":_approx})
|
logger.write({"module":"gan-generate","action":"approximate","status":_approx})
|
||||||
if set(df.columns) & set(_df.columns) :
|
if set(df.columns) & set(_df.columns) :
|
||||||
_columns = set(df.columns) - set(_df.columns)
|
_columns = list(set(df.columns) - set(_df.columns))
|
||||||
df = df[_columns]
|
df = df[_columns]
|
||||||
|
|
||||||
#
|
#
|
||||||
# Let us merge the dataset here and and have a comprehensive dataset
|
# Let us merge the dataset here and and have a comprehensive dataset
|
||||||
|
|
||||||
_df = pd.DataFrame.join(df,_df)
|
_df = pd.DataFrame.join(df,_df)
|
||||||
|
_params = {'data':_df,'store' : ostore}
|
||||||
# if _schema :
|
if _schema :
|
||||||
# for _item in _schema :
|
_params ['schema'] = _schema
|
||||||
# if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
|
_info = {"module":"gan-prep","action":"write","input":{"rows":_df.shape[0],"cols":_df.shape[1]}}
|
||||||
# _df[_item['name']] = _df[_item['name']].astype(str)
|
logger.write(_info)
|
||||||
|
self.post(**_params)
|
||||||
# pass
|
# print (['_______ posting _________________',_df.shape])
|
||||||
_params = {'data':_df,'store' : ostore}
|
break
|
||||||
if _schema :
|
|
||||||
_params ['schema'] = _schema
|
|
||||||
self.post(**_params)
|
|
||||||
# if _schema :
|
|
||||||
# writer.write(_df[cols],schema=_schema,table=args['from'])
|
|
||||||
# self.post(data=_df,schema=)
|
|
||||||
# else:
|
|
||||||
# writer.write(_df[cols],table=args['from'])
|
|
||||||
|
|
||||||
pass
|
pass
|
||||||
# else:
|
# else:
|
||||||
# pass
|
# pass
|
||||||
|
def bind(self,**_args):
|
||||||
|
print (_args)
|
||||||
# #
|
|
||||||
# # We need to post the generate the data in order to :
|
|
||||||
# # 1. compare immediately
|
|
||||||
# # 2. synthetic copy
|
|
||||||
# #
|
|
||||||
|
|
||||||
# cols = _dc.columns.tolist()
|
|
||||||
|
|
||||||
# data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query)
|
|
||||||
# #
|
|
||||||
# # performing basic analytics on the synthetic data generated (easy to quickly asses)
|
|
||||||
# #
|
|
||||||
# info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}}
|
|
||||||
|
|
||||||
# #
|
|
||||||
# # @TODO: Send data over to a process for analytics
|
|
||||||
|
|
||||||
# base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it)
|
|
||||||
# cols = _dc.columns.tolist()
|
|
||||||
# for name in cols :
|
|
||||||
# _args['data'][name] = _dc[name]
|
|
||||||
|
|
||||||
# #
|
|
||||||
# #-- Let us store all of this into bigquery
|
|
||||||
# prefix = args['notify']+'.'+_args['context']
|
|
||||||
# partition = str(partition)
|
|
||||||
# table = '_'.join([prefix,partition,'io']).replace('__','_')
|
|
||||||
# folder = os.sep.join([args['logs'],args['context'],partition,'output'])
|
|
||||||
# if 'file' in args :
|
|
||||||
|
|
||||||
# _fname = os.sep.join([folder,table.replace('_io','_full_io.csv')])
|
|
||||||
# _pname = os.sep.join([folder,table])+'.csv'
|
|
||||||
# data_comp.to_csv( _pname,index=False)
|
|
||||||
# _args['data'].to_csv(_fname,index=False)
|
|
||||||
|
|
||||||
# _id = 'path'
|
|
||||||
# else:
|
|
||||||
|
|
||||||
# credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
|
|
||||||
# _pname = os.sep.join([folder,table+'.csv'])
|
|
||||||
# _fname = table.replace('_io','_full_io')
|
|
||||||
# partial = '.'.join(['io',args['context']+'_partial_io'])
|
|
||||||
# complete= '.'.join(['io',args['context']+'_full_io'])
|
|
||||||
# data_comp.to_csv(_pname,index=False)
|
|
||||||
# if 'dump' in args :
|
|
||||||
# print (_args['data'].head())
|
|
||||||
# else:
|
|
||||||
# Components.lock.acquire()
|
|
||||||
# data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000)
|
|
||||||
# _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000)
|
|
||||||
# Components.lock.release()
|
|
||||||
# _id = 'dataset'
|
|
||||||
# info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} }
|
|
||||||
# if partition :
|
|
||||||
# info ['partition'] = int(partition)
|
|
||||||
# logger.write({"module":"generate","action":"write","input":info} )
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__' :
|
if __name__ == '__main__' :
|
||||||
|
@ -611,6 +574,50 @@ if __name__ == '__main__' :
|
||||||
|
|
||||||
generator = Components()
|
generator = Components()
|
||||||
generator.generate(args)
|
generator.generate(args)
|
||||||
|
elif 'bind' in SYS_ARGS :
|
||||||
|
import binder
|
||||||
|
_args = _config['_map']
|
||||||
|
_args['store'] = copy.deepcopy(_config['store'])
|
||||||
|
if 'init' in SYS_ARGS :
|
||||||
|
#
|
||||||
|
# Creating and persisting the map ...
|
||||||
|
print (['.... Binding Initialization'])
|
||||||
|
# jobs = binder.Init(**_args)
|
||||||
|
_mapped = binder.Init(**_args)
|
||||||
|
|
||||||
|
|
||||||
|
_schema = [{"name":_name,"type":"INTEGER"} for _name in _mapped.columns.tolist()]
|
||||||
|
publisher = lambda _params: (Components()).post(**_params)
|
||||||
|
_args = {'data':_mapped,'store':_config['store']['target']}
|
||||||
|
_args['store']['table'] = '_map'
|
||||||
|
if _args['store']['provider'] =='bigquery' :
|
||||||
|
_args['schema'] = _schema
|
||||||
|
|
||||||
|
job = Process (target = publisher,args=(_args,))
|
||||||
|
job.start()
|
||||||
|
jobs = [job]
|
||||||
|
else:
|
||||||
|
#
|
||||||
|
# Applying the map of k on a particular dataset
|
||||||
|
#
|
||||||
|
index = int(SYS_ARGS['index'])
|
||||||
|
_args['config'] = _config['pipeline'][index]
|
||||||
|
_args['original_key'] = 'person_id' if 'original_key' in _config else 'person_id'
|
||||||
|
table = _config['pipeline'][index]['from']
|
||||||
|
_df = binder.ApplyOn(**_args)
|
||||||
|
_df = np.array_split(_df,PART_SIZE)
|
||||||
|
jobs = []
|
||||||
|
print (['Publishing ',PART_SIZE,' PARTITION'])
|
||||||
|
for data in _df :
|
||||||
|
publisher = lambda _params: ( Components() ).post(**_params)
|
||||||
|
_args = {'data':data,'store':_config['store']['target']}
|
||||||
|
_args['store']['table'] = table
|
||||||
|
print (_args['store'])
|
||||||
|
job = Process(target = publisher,args=(_args,))
|
||||||
|
job.name = "Publisher "+str(len(jobs)+1)
|
||||||
|
job.start()
|
||||||
|
jobs.append(job)
|
||||||
|
|
||||||
elif 'shuffle' in SYS_ARGS :
|
elif 'shuffle' in SYS_ARGS :
|
||||||
index = 0
|
index = 0
|
||||||
if GPU_CHIPS and 'all-chips' in SYS_ARGS:
|
if GPU_CHIPS and 'all-chips' in SYS_ARGS:
|
||||||
|
@ -632,6 +639,7 @@ if __name__ == '__main__' :
|
||||||
# Let us create n-jobs across n-gpus, The assumption here is the data that is produced will be a partition
|
# Let us create n-jobs across n-gpus, The assumption here is the data that is produced will be a partition
|
||||||
# @TODO: Find better name for partition
|
# @TODO: Find better name for partition
|
||||||
#
|
#
|
||||||
|
|
||||||
if GPU_CHIPS and 'all-chips' in SYS_ARGS:
|
if GPU_CHIPS and 'all-chips' in SYS_ARGS:
|
||||||
index = 0
|
index = 0
|
||||||
print (['... launching ',len(GPU_CHIPS),' jobs',args['context']])
|
print (['... launching ',len(GPU_CHIPS),' jobs',args['context']])
|
||||||
|
@ -652,12 +660,15 @@ if __name__ == '__main__' :
|
||||||
else:
|
else:
|
||||||
#
|
#
|
||||||
# The choice of the chip will be made internally
|
# The choice of the chip will be made internally
|
||||||
|
|
||||||
agent = Components()
|
agent = Components()
|
||||||
agent.train(**args)
|
agent.train(**args)
|
||||||
#
|
#
|
||||||
# If we have any obs we should wait till they finish
|
# If we have any obs we should wait till they finish
|
||||||
#
|
#
|
||||||
DIRTY = 0
|
DIRTY = 0
|
||||||
|
if (len(jobs)) :
|
||||||
|
print (['.... waiting on ',len(jobs),' jobs'])
|
||||||
while len(jobs)> 0 :
|
while len(jobs)> 0 :
|
||||||
DIRTY =1
|
DIRTY =1
|
||||||
jobs = [job for job in jobs if job.is_alive()]
|
jobs = [job for job in jobs if job.is_alive()]
|
||||||
|
@ -666,47 +677,16 @@ if __name__ == '__main__' :
|
||||||
print (["..:: jobs finished "])
|
print (["..:: jobs finished "])
|
||||||
#
|
#
|
||||||
# We need to harmonize the keys if any at all in this case we do this for shuffle or generate operations
|
# We need to harmonize the keys if any at all in this case we do this for shuffle or generate operations
|
||||||
#
|
# This holds true for bigquery - bigquery only
|
||||||
|
IS_BIGQUERY = _config['store']['source']['provider'] == _config['store']['target']['provider'] and _config['store']['source']['provider'] == 'bigquery'
|
||||||
if 'autopilot' in SYS_ARGS or 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) :
|
|
||||||
#
|
|
||||||
# We should pull all the primary keys and regenerate them in order to insure some form of consistency
|
|
||||||
#
|
|
||||||
print (["..:: Finalizing process"])
|
|
||||||
(Components()).finalize(args)
|
|
||||||
# finalize(args)
|
|
||||||
pass
|
|
||||||
# jobs = []
|
|
||||||
# for index in range(0,PART_SIZE) :
|
|
||||||
# if 'focus' in args and int(args['focus']) != index :
|
|
||||||
# continue
|
|
||||||
# args['part_size'] = PART_SIZE
|
|
||||||
# args['partition'] = index
|
|
||||||
# args['data'] = DATA[index]
|
|
||||||
# if int(args['num_gpu']) > 1 :
|
|
||||||
# args['gpu'] = index
|
|
||||||
# else:
|
|
||||||
# args['gpu']=0
|
|
||||||
|
|
||||||
# make = lambda _args: (Components()).train(**_args)
|
# if 'bind' not in SYS_ARGS and IS_BIGQUERY and ('autopilot' in SYS_ARGS or 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS)) :
|
||||||
# job = Process(target=make,args=( dict(args),))
|
# #
|
||||||
# job.name = 'Trainer # ' + str(index)
|
# # We should pull all the primary keys and regenerate them in order to insure some form of consistency
|
||||||
# job.start()
|
# #
|
||||||
# jobs.append(job)
|
|
||||||
# # args['gpu']
|
|
||||||
# print (["Started ",len(jobs),"trainers" if len(jobs)>1 else "trainer" ])
|
|
||||||
# while len(jobs)> 0 :
|
|
||||||
# jobs = [job for job in jobs if job.is_alive()]
|
|
||||||
# time.sleep(2)
|
|
||||||
|
|
||||||
# trainer = Components()
|
# #
|
||||||
# trainer.train(**args)
|
# #
|
||||||
|
|
||||||
|
# print (["..:: Finalizing process"])
|
||||||
# Components.train(**args)
|
# (Components()).finalize(args)
|
||||||
#for args in PIPELINE :
|
|
||||||
#args['dataset'] = 'combined20190510'
|
|
||||||
#process = Process(target=Components.train,args=(args,))
|
|
||||||
#process.name = args['context']
|
|
||||||
#process.start()
|
|
||||||
# Components.train(args)