bug fixes: enhancements
This commit is contained in:
parent
cad54d7b45
commit
ee0165de01
|
@ -0,0 +1,377 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
This file will perform basic tasks to finalize the GAN process by performing the following :
|
||||
- basic stats & analytics
|
||||
- rebuild io to another dataset
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from multiprocessing import Process, Lock
|
||||
from google.oauth2 import service_account
|
||||
from google.cloud import bigquery as bq
|
||||
import transport
|
||||
from data.params import SYS_ARGS
|
||||
import json
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from google.oauth2 import service_account
|
||||
import json
|
||||
|
||||
# path = '../curation-prod.json'
|
||||
# credentials = service_account.Credentials.from_service_account_file(path)
|
||||
# df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard')
|
||||
filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config']
|
||||
f = open(filename)
|
||||
config = json.loads(f.read())
|
||||
args = config['pipeline']
|
||||
f.close()
|
||||
|
||||
def _formatSQL(**_args):
|
||||
"""
|
||||
This function will build the _map for a given segment
|
||||
"""
|
||||
sql = """
|
||||
select DISTINCT x.person_id synthetic,y.person_id original
|
||||
FROM :synthetic.:table x
|
||||
INNER JOIN :original.:table y on x.person_id in (:ids)
|
||||
AND x.person_id <> y.person_id AND x.gender_source_value = y.gender_source_value
|
||||
AND x.year_of_birth = y.year_of_birth
|
||||
ORDER BY 1
|
||||
"""
|
||||
table= _args['table']
|
||||
original,synthetic = _args['schema']['original'],_args['schema']['synthetic']
|
||||
_ids = np.array(_args['ids']).astype(str)
|
||||
return sql.replace(":ids",",".join(_ids)).replace(":synthetic",synthetic).replace(":original",original).replace(":table",table)
|
||||
def _addCounts(**_args) :
|
||||
store = _args['store']
|
||||
sql = _args['sql']
|
||||
reader = transport.factory.instance(**store['source'])
|
||||
_df = reader.read(sql=sql)
|
||||
_ids = _df.synthetic.unique()
|
||||
_counts = [ np.sum(_df.synthetic == value) for value in _ids]
|
||||
original = [_df[_df.synthetic == value].iloc[np.random.choice(np.arange(_counts[_ids.tolist().index(value)]),1),:].original.values[0] for value in _ids]
|
||||
_df = pd.DataFrame({"synthetic":_ids,"original":original,"counts":_counts})
|
||||
|
||||
#
|
||||
# We can post this to the backend ...
|
||||
#
|
||||
table = '_map' #-- Yes this is hard-coded
|
||||
writer = transport.factory.instance(**dict(store['target'],**{"parallel":True,"table":table}))
|
||||
# if writer.has(table=table) is False:
|
||||
# writer.write(_df)
|
||||
# else:
|
||||
_schema = [{"name":name,"type":"INTEGER"} for name in _df.columns]
|
||||
writer.write(_df,schema=_schema)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def Init(**_args) :
|
||||
"""
|
||||
This function will build a map of the synthetic to real individuals.
|
||||
The assumption is that the synthesized data is stored in the same data-store as the original the parameters provided are :
|
||||
:param store object from the configuration file with source,target entries
|
||||
:param table name of the original/synthetic tables (they should be the same)
|
||||
:param feat. featuress/attributes ... demographics to account for
|
||||
"""
|
||||
store = _args['store']
|
||||
reader = transport.factory.instance(**store['source'])
|
||||
original,synthetic = _args['schema']['original'],_args['schema']['synthetic']
|
||||
table = _args['table']
|
||||
sql = _args['sql'].replace(':synthetic',synthetic).replace(':original',original).replace(':table',table)
|
||||
|
||||
_map = reader.read(sql=sql)
|
||||
|
||||
|
||||
|
||||
k = _args['k'] if 'k' in _args else 2
|
||||
# _iodf = reader.read(table=table)
|
||||
# _ids = _iodf['person_id'].unique().tolist()
|
||||
# x_ = np.array_split(_ids,1000)
|
||||
jobs = []
|
||||
# for _items in x_ :
|
||||
# _p = {"ids":_items,"schema":_args['schema'],'store':store,'table':table}
|
||||
# sql = _formatSQL(**_p)
|
||||
# _p['sql'] = sql
|
||||
# _apply = lambda params: _addCounts(**params)
|
||||
# thread = Process(target=_apply,args=(_p,))
|
||||
# thread.start()
|
||||
# jobs.append(thread)
|
||||
|
||||
# return jobs
|
||||
#
|
||||
# We have performed a m:m (many-to-many) relationship with original participants and synthetic participants
|
||||
# The goal is to obtain a singular map against which records will be migrated
|
||||
#
|
||||
print (['... computing counts (k)'])
|
||||
_ids = _map.synthetic.unique()
|
||||
_counts = [ np.sum(_map.synthetic == value) for value in _ids]
|
||||
original = [_map[_map.synthetic == value].iloc[np.random.choice(np.arange(_counts[_ids.tolist().index(value)]),1),:].original.values[0] for value in _ids]
|
||||
print (['Building k-classes/groups'])
|
||||
_mdf = pd.DataFrame({"synthetic":_ids,"original":original,"counts":_counts})
|
||||
i = _mdf.apply(lambda row: row.counts >= k,axis=1)
|
||||
_mdf = _mdf[i]
|
||||
#
|
||||
# Log what just happened here so we know about the equivalence classes,
|
||||
# {"module":"binder","action":"map-generation","input":{"k":k,"rows":{"synthetic":_mdf.shape[0],"original":len(_counts)}}}
|
||||
|
||||
return _mdf
|
||||
#
|
||||
# now we are posting this to target storage ...
|
||||
#
|
||||
def ApplyOn (**_args):
|
||||
"""
|
||||
This function will rewrite SQL that applies the synthetic identifier to the entries of the pipeline
|
||||
We assume that the _map has two attributes (synthetic and original)
|
||||
:param store
|
||||
:param _config
|
||||
"""
|
||||
store_args = _args['store']
|
||||
_config = _args['config']
|
||||
|
||||
table = _config['from']
|
||||
reader = transport.factory.instance(**dict(store_args['source'],**{"table":table}))
|
||||
attr = reader.read(limit=1).columns.tolist()
|
||||
original_key = _args['original_key'] #-- assuming referential integrity
|
||||
|
||||
# synthetic_key= columns['synthetic']
|
||||
# mapped_original=columns['orginal']
|
||||
fields = list(set(attr) - set([original_key]))
|
||||
sql = "select _map.synthetic as :original_key,:fields from :original_schema.:table inner join :synthetic_schema._map on _map.original = :table.:original_key"
|
||||
sql = sql.replace(":table",table).replace(":fields",",".join(fields))
|
||||
sql = sql.replace(":original_key",original_key)
|
||||
_schema = _args['schema']
|
||||
sql = sql.replace(":original_schema",_schema['original']).replace(":synthetic_schema",_schema['synthetic'])
|
||||
|
||||
return reader.read (sql=sql)
|
||||
|
||||
if __name__ == '__main__' :
|
||||
pass
|
||||
|
||||
# class Analytics :
|
||||
# """
|
||||
# This class will compile basic analytics about a given dataset i.e compare original/synthetic
|
||||
# """
|
||||
# @staticmethod
|
||||
# def distribution(**args):
|
||||
# context = args['context']
|
||||
# df = args['data']
|
||||
# #
|
||||
# #-- This data frame counts unique values for each feature (space)
|
||||
# df_counts = pd.DataFrame(df.apply(lambda col: col.unique().size),columns=['counts']).T # unique counts
|
||||
# #
|
||||
# #-- Get the distributions for common values
|
||||
# #
|
||||
# names = [name for name in df_counts.columns.tolist() if name.endswith('_io') == False]
|
||||
# ddf = df.apply(lambda col: pd.DataFrame(col.values,columns=[col.name]).groupby([col.name]).size() ).fillna(0)
|
||||
# ddf[context] = ddf.index
|
||||
|
||||
# pass
|
||||
# def distance(**args):
|
||||
# """
|
||||
# This function will measure the distance between
|
||||
# """
|
||||
# pass
|
||||
# class Utils :
|
||||
# @staticmethod
|
||||
# def log(**args):
|
||||
# logger = transport.factory.instance(type="mongo.MongoWriter",args={"dbname":"aou","doc":"logs"})
|
||||
# logger.write(args)
|
||||
# logger.close()
|
||||
# class get :
|
||||
# @staticmethod
|
||||
# def pipeline(table,path) :
|
||||
# # contexts = args['contexts'].split(',') if type(args['contexts']) == str else args['contexts']
|
||||
# config = json.loads((open(path)).read())
|
||||
# pipeline = config['pipeline']
|
||||
# # return [ item for item in pipeline if item['context'] in contexts]
|
||||
# pipeline = [item for item in pipeline if 'from' in item and item['from'].strip() == table]
|
||||
# Utils.log(module=table,action='init',input={"pipeline":pipeline})
|
||||
# return pipeline
|
||||
# @staticmethod
|
||||
# def sql(**args) :
|
||||
# """
|
||||
# This function is intended to build SQL query for the remainder of the table that was not synthesized
|
||||
# :config configuration entries
|
||||
# :from source of the table name
|
||||
# :dataset name of the source dataset
|
||||
|
||||
# """
|
||||
# SQL = ["SELECT * FROM :from "]
|
||||
# SQL_FILTER = []
|
||||
# NO_FILTERS_FOUND = True
|
||||
# # pipeline = Utils.get.config(**args)
|
||||
# pipeline = args['pipeline']
|
||||
# REVERSE_QUALIFIER = {'IN':'NOT IN','NOT IN':'IN','=':'<>','<>':'='}
|
||||
# for item in pipeline :
|
||||
|
||||
|
||||
# if 'filter' in item :
|
||||
# if NO_FILTERS_FOUND :
|
||||
# NO_FILTERS_FOUND = False
|
||||
# SQL += ['WHERE']
|
||||
# #
|
||||
# # Let us load the filter in the SQL Query
|
||||
# FILTER = item['filter']
|
||||
# QUALIFIER = REVERSE_QUALIFIER[FILTER['qualifier'].upper()]
|
||||
# SQL_FILTER += [" ".join([FILTER['field'], QUALIFIER,'(',FILTER['value'],')']).replace(":dataset",args['dataset'])]
|
||||
# src = ".".join([args['dataset'],args['from']])
|
||||
# SQL += [" AND ".join(SQL_FILTER)]
|
||||
# #
|
||||
# # let's pull the field schemas out of the table definition
|
||||
# #
|
||||
# Utils.log(module=args['from'],action='sql',input={"sql":" ".join(SQL) })
|
||||
# return " ".join(SQL).replace(":from",src)
|
||||
|
||||
|
||||
# def mk(**args) :
|
||||
# dataset = args['dataset']
|
||||
# client = args['client'] if 'client' in args else bq.Client.from_service_account_file(args['private_key'])
|
||||
# #
|
||||
# # let us see if we have a dataset handy here
|
||||
# #
|
||||
# datasets = list(client.list_datasets())
|
||||
# found = [item for item in datasets if item.dataset_id == dataset]
|
||||
|
||||
# if not found :
|
||||
|
||||
# return client.create_dataset(dataset)
|
||||
# return found[0]
|
||||
|
||||
# def move (args):
|
||||
# """
|
||||
# This function will move a table from the synthetic dataset into a designated location
|
||||
# This is the simplest case for finalizing a synthetic data set
|
||||
# :private_key
|
||||
# """
|
||||
# pipeline = Utils.get.pipeline(args['from'],args['config'])
|
||||
# _args = json.loads((open(args['config'])).read())
|
||||
# _args['pipeline'] = pipeline
|
||||
# # del _args['pipeline']
|
||||
# args = dict(args,**_args)
|
||||
# # del args['pipeline']
|
||||
# # private_key = args['private_key']
|
||||
# client = bq.Client.from_service_account_json(args['private_key'])
|
||||
|
||||
# dataset = args['dataset']
|
||||
# if pipeline :
|
||||
# SQL = [ ''.join(["SELECT * FROM io.",item['context'],'_full_io']) for item in pipeline]
|
||||
# SQL += [Utils.get.sql(**args)]
|
||||
# SQL = ('\n UNION ALL \n'.join(SQL).replace(':dataset','io'))
|
||||
# else:
|
||||
# #
|
||||
# # moving a table to a designated location
|
||||
# tablename = args['from']
|
||||
# if 'sql' not in args :
|
||||
# SQL = "SELECT * FROM :dataset.:table"
|
||||
# else:
|
||||
# SQL = args['sql']
|
||||
# SQL = SQL.replace(":dataset",dataset).replace(":table",tablename)
|
||||
# Utils.log(module=args['from'],action='sql',input={'sql':SQL})
|
||||
# #
|
||||
# # At this point we have gathered all the tables in the io folder and we should now see if we need to merge with the remainder from the original table
|
||||
# #
|
||||
|
||||
|
||||
|
||||
# odataset = mk(dataset=dataset+'_io',client=client)
|
||||
# # SQL = "SELECT * FROM io.:context_full_io".replace(':context',context)
|
||||
# config = bq.QueryJobConfig()
|
||||
# config.destination = client.dataset(odataset.dataset_id).table(args['from'])
|
||||
# config.use_query_cache = True
|
||||
# config.allow_large_results = True
|
||||
# config.priority = 'INTERACTIVE'
|
||||
# #
|
||||
# #
|
||||
|
||||
# schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema
|
||||
# fields = [" ".join(["CAST (",item.name,"AS",item.field_type.replace("INTEGER","INT64").replace("FLOAT","FLOAT64"),") ",item.name]) for item in schema]
|
||||
# SQL = SQL.replace("*"," , ".join(fields))
|
||||
# # print (SQL)
|
||||
# out = client.query(SQL,location='US',job_config=config)
|
||||
# Utils.log(module=args['from'],action='move',input={'job':out.job_id})
|
||||
# return (out.job_id)
|
||||
|
||||
|
||||
|
||||
|
||||
# import pandas as pd
|
||||
# import numpy as np
|
||||
# from google.oauth2 import service_account
|
||||
# import json
|
||||
|
||||
# # path = '../curation-prod.json'
|
||||
# # credentials = service_account.Credentials.from_service_account_file(path)
|
||||
# # df = pd.read_gbq("SELECT * FROM io.icd10_partial_io",credentials=credentials,dialect='standard')
|
||||
# filename = 'config.json' if 'config' not in SYS_ARGS else SYS_ARGS['config']
|
||||
# f = open(filename)
|
||||
# config = json.loads(f.read())
|
||||
# args = config['pipeline']
|
||||
# f.close()
|
||||
|
||||
|
||||
# if __name__ == '__main__' :
|
||||
# """
|
||||
# Usage :
|
||||
# finalize --<move|stats> --contexts <c1,c2,...c3> --from <table>
|
||||
# """
|
||||
|
||||
# if 'move' in SYS_ARGS :
|
||||
|
||||
# if 'init' in SYS_ARGS :
|
||||
# dep = config['dep'] if 'dep' in config else {}
|
||||
# info = []
|
||||
|
||||
# if 'queries' in dep :
|
||||
# info += dep['queries']
|
||||
# print ('________')
|
||||
# if 'tables' in dep :
|
||||
# info += dep['tables']
|
||||
# args = {}
|
||||
# jobs = []
|
||||
# for item in info :
|
||||
# args = {}
|
||||
# if type(item) == str :
|
||||
# args['from'] = item
|
||||
# name = item
|
||||
# else:
|
||||
# args = item
|
||||
# name = item['from']
|
||||
# args['config'] = SYS_ARGS['config']
|
||||
# # args['pipeline'] = []
|
||||
# job = Process(target=move,args=(args,))
|
||||
# job.name = name
|
||||
# jobs.append(job)
|
||||
# job.start()
|
||||
|
||||
|
||||
# # while len(jobs) > 0 :
|
||||
# # jobs = [job for job in jobs if job.is_alive()]
|
||||
# # time.sleep(1)
|
||||
|
||||
|
||||
# else:
|
||||
# move(SYS_ARGS)
|
||||
# # # table = SYS_ARGS['from']
|
||||
# # # args = dict(config,**{"private_key":"../curation-prod.json"})
|
||||
# # args = dict(args,**SYS_ARGS)
|
||||
# # contexts = [item['context'] for item in config['pipeline'] if item['from'] == SYS_ARGS['from']]
|
||||
# # log = []
|
||||
# # if contexts :
|
||||
# # args['contexts'] = contexts
|
||||
# # log = move(**args)
|
||||
|
||||
# # else:
|
||||
# # tables = args['from'].split(',')
|
||||
# # for name in tables :
|
||||
# # name = name.strip()
|
||||
# # args['from'] = name
|
||||
# # log += [move(**args)]
|
||||
# # print ("\n".join(log))
|
||||
|
||||
|
||||
|
||||
# else:
|
||||
# print ("NOT YET READY !")
|
54
data/gan.py
54
data/gan.py
|
@ -622,7 +622,7 @@ class Predict(GNet):
|
|||
candidates.append(np.array([np.round(row).astype(int) for row in _matrix]))
|
||||
# return candidates[0] if len(candidates) == 1 else candidates
|
||||
|
||||
return candidates
|
||||
return [candidates [0]]
|
||||
|
||||
def _apply(self,**args):
|
||||
# print (self.train_dir)
|
||||
|
@ -768,55 +768,3 @@ class Predict(GNet):
|
|||
# return df.to_dict(orient='list')
|
||||
return _matrix
|
||||
|
||||
|
||||
if __name__ == '__main__' :
|
||||
#
|
||||
# Now we get things done ...
|
||||
column = SYS_ARGS['column']
|
||||
column_id = SYS_ARGS['id'] if 'id' in SYS_ARGS else 'person_id'
|
||||
column_id = column_id.split(',') if ',' in column_id else column_id
|
||||
df = pd.read_csv(SYS_ARGS['raw-data'])
|
||||
LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values
|
||||
|
||||
context = SYS_ARGS['raw-data'].split(os.sep)[-1:][0][:-4]
|
||||
if set(['train','learn']) & set(SYS_ARGS.keys()):
|
||||
|
||||
df = pd.read_csv(SYS_ARGS['raw-data'])
|
||||
|
||||
# cols = SYS_ARGS['column']
|
||||
# _map,_df = (Binary()).Export(df)
|
||||
# i = np.arange(_map[column]['start'],_map[column]['end'])
|
||||
max_epochs = np.int32(SYS_ARGS['max_epochs']) if 'max_epochs' in SYS_ARGS else 10
|
||||
# REAL = _df[:,i]
|
||||
REAL = pd.get_dummies(df[column]).astype(np.float32).values
|
||||
LABEL = pd.get_dummies(df[column_id]).astype(np.float32).values
|
||||
trainer = Train(context=context,max_epochs=max_epochs,real=REAL,label=LABEL,column=column,column_id=column_id)
|
||||
trainer.apply()
|
||||
|
||||
|
||||
|
||||
|
||||
#
|
||||
# We should train upon this data
|
||||
#
|
||||
# -- we need to convert the data-frame to binary matrix, given a column
|
||||
#
|
||||
pass
|
||||
elif 'generate' in SYS_ARGS:
|
||||
values = df[column].unique().tolist()
|
||||
values.sort()
|
||||
|
||||
p = Predict(context=context,label=LABEL,values=values,column=column)
|
||||
p.load_meta(column)
|
||||
r = p.apply()
|
||||
# print (df)
|
||||
# print ()
|
||||
df[column] = r[column]
|
||||
# print (df)
|
||||
|
||||
|
||||
else:
|
||||
print (SYS_ARGS.keys())
|
||||
print (__doc__)
|
||||
pass
|
||||
|
||||
|
|
|
@ -96,7 +96,11 @@ def train (**_args):
|
|||
# This
|
||||
|
||||
args['store'] = copy.deepcopy(_args['store']['logs'])
|
||||
args['store']['args']['doc'] = _args['context']
|
||||
if 'args' in _args['store']:
|
||||
args['store']['args']['doc'] = _args['context']
|
||||
else:
|
||||
|
||||
args['store']['doc'] = _args['context']
|
||||
logger = factory.instance(**args['store'])
|
||||
args['logger'] = logger
|
||||
|
||||
|
|
|
@ -39,26 +39,10 @@ class Input :
|
|||
- provide a feature space, and rows (matrix profile)
|
||||
- a data index map
|
||||
"""
|
||||
# def learn(self,**_args):
|
||||
# """
|
||||
# This function is designed to learn about, the data and persist
|
||||
# :param table
|
||||
# :param store
|
||||
# """
|
||||
# table = _args['table']
|
||||
# reader = transport.factory.instance(**_args['store'])
|
||||
# df = reader.read(table=table,limit=1)
|
||||
# self.columns = df.columns.tolist()
|
||||
|
||||
# self._metadf = pd.DataFrame(self.df[self._columns].dtypes.values.astype(str)).T #,self._columns]
|
||||
# self._metadf.columns = self._columns
|
||||
|
||||
# sql = "SELECT :fields from :table".replace(":table",table)
|
||||
|
||||
|
||||
def __init__(self,**_args):
|
||||
"""
|
||||
:param table
|
||||
:param data
|
||||
:param store data-store parameters/configuration
|
||||
:param sql sql query that pulls a representative sample of the data
|
||||
"""
|
||||
|
@ -70,29 +54,18 @@ class Input :
|
|||
pass
|
||||
else:
|
||||
self._initsql(**_args)
|
||||
#
|
||||
# We need to have a means to map of values,columns and vector positions in order
|
||||
# to perform convert and revert to and from binary
|
||||
#
|
||||
self._map = {} if 'map' not in _args else _args['map']
|
||||
# self._metadf = pd.DataFrame(self.df[self._columns].dtypes.values.astype(str)).T #,self._columns]
|
||||
# self._metadf.columns = self._columns
|
||||
# if 'gpu' in _args and 'GPU' in os.environ:
|
||||
|
||||
# np = cp
|
||||
# index = int(_args['gpu'])
|
||||
# np.cuda.Device(index).use()
|
||||
# print(['..:: GPU ',index])
|
||||
|
||||
def _initsql(self,**_args):
|
||||
"""
|
||||
This function will initialize the class on the basis of a data-store and optionally pre-defined columns to be used to be synthesized
|
||||
:param store data-store configuration
|
||||
:param sql sql query to be applied to the transported data
|
||||
:param columns list of columns to be
|
||||
"""
|
||||
# _store_args = _args['store']
|
||||
# reader = transport.factory.instance(**_store_args)
|
||||
# sql = _args['sql']
|
||||
|
||||
# self.df = reader.read(sql=_args['sql'])
|
||||
|
||||
|
||||
if 'columns' not in _args :
|
||||
self._initcols(data=self.df)
|
||||
|
@ -128,14 +101,6 @@ class Input :
|
|||
:param data data-frame that holds the data
|
||||
:param columns columns that need to be synthesized if any
|
||||
"""
|
||||
#
|
||||
# setting class-level variables to be reused across the class
|
||||
# self.df = _args['data']
|
||||
row_count = self.df.shape[0]
|
||||
# self.columns = self.df.columns
|
||||
# self._metadf = self.df.apply(lambda col: col.unique().size)
|
||||
# _df = pd.DataFrame(self.df.apply(lambda col: col.unique().size )).T
|
||||
# cols = None if 'columns' not in _args else _args['columns']
|
||||
self._initcols(**_args)
|
||||
|
||||
def convert(self,**_args):
|
||||
|
@ -247,16 +212,3 @@ class Input :
|
|||
|
||||
return cols,_matrix
|
||||
|
||||
if __name__ == '__main__' :
|
||||
df = pd.read_csv('../../sample.csv')
|
||||
_input = Input(data=df,columns=['age','race'])
|
||||
_m = _input.convert(column='age')
|
||||
print (_m.shape)
|
||||
print (_input.revert(matrix=_m,column='age'))
|
||||
print (_input._metadf)
|
||||
|
||||
# _args = {"store":{"type":"sql.BQReader","args":{"service_key":"/home/steve/dev/aou/accounts/curation-prod.json"}}}
|
||||
# _args['table'] = 'io.observation'
|
||||
# _i = Input(**_args)
|
||||
# df = pd.read_csv('../../sample.csv')
|
||||
# print (Input.ToBinary(df.age))
|
330
pipeline.py
330
pipeline.py
|
@ -101,11 +101,14 @@ class Components :
|
|||
df = pd.read_csv(args['file'])
|
||||
del args['file']
|
||||
elif 'data' not in args :
|
||||
|
||||
reader = factory.instance(**args['store']['source'])
|
||||
|
||||
|
||||
if 'row_limit' in args :
|
||||
df = reader.read(sql=args['sql'],limit=args['row_limit'])
|
||||
else:
|
||||
df = reader.read(sql=args['sql'])
|
||||
df = reader.read(sql=args['sql'])
|
||||
schema = reader.meta(table=args['from']) if hasattr(reader,'meta') and 'from' in args else None
|
||||
else:
|
||||
df = args['data']
|
||||
|
@ -241,6 +244,7 @@ class Components :
|
|||
df.index = np.arange(df.shape[0])
|
||||
self.post(data=df,schema=schema,store=args['store']['target'])
|
||||
def post(self,**_args) :
|
||||
table = _args['from'] if 'from' in _args else _args['store']['table']
|
||||
_schema = _args['schema'] if 'schema' in _args else None
|
||||
writer = factory.instance(**_args['store'])
|
||||
_df = _args['data']
|
||||
|
@ -251,13 +255,13 @@ class Components :
|
|||
_type = str
|
||||
_value = 0
|
||||
if _item['type'] in ['DATE','TIMESTAMP','DATETIMESTAMP','DATETIME'] :
|
||||
if _item['type'] == 'DATE' :
|
||||
if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
|
||||
#
|
||||
# There is an issue with missing dates that needs to be resolved.
|
||||
# for some reason a missing date/time here will cause the types to turn into timestamp (problem)
|
||||
# The following is a hack to address the issue (alas) assuming 10 digit dates and 'NaT' replaces missing date values (pandas specifications)
|
||||
#
|
||||
_df[name] = _df[name].apply(lambda value: '' if str(value) == 'NaT' else str(value)[:10])
|
||||
_df[name] = _df[name].apply(lambda value: None if str(value) == 'NaT' else (str(value)[:10]) if _item['type'] in ['DATE','DATETIME'] else str(value))
|
||||
#_df[name] = _df[name].dt.date
|
||||
# _df[name] = pd.to_datetime(_df[name].fillna(''),errors='coerce')
|
||||
else:
|
||||
|
@ -274,11 +278,33 @@ class Components :
|
|||
_value = ''
|
||||
_df[name] = _df[name].fillna(_value) #.astype(_type)
|
||||
columns.append(name)
|
||||
print ()
|
||||
print (_df)
|
||||
writer.write(_df.astype(object),schema=_schema,table=args['from'])
|
||||
|
||||
fields = _df.columns.tolist()
|
||||
if not writer.has(table=table) and _args['store']['provider'] != 'bigquery':
|
||||
|
||||
_map = {'STRING':'VARCHAR(256)','INTEGER':'BIGINT'} if 'provider' in _args['store'] and _args['store']['provider'] != 'bigquery' else {}
|
||||
_params = {'map':_map,'table':args['from']}
|
||||
if _schema :
|
||||
_params['schema'] = _schema
|
||||
|
||||
else:
|
||||
_params['fields'] = fields
|
||||
|
||||
writer.make(**_params)
|
||||
|
||||
fields = _df.columns.tolist()
|
||||
_df = _df[fields]
|
||||
# writer.fields = fields
|
||||
if _args['store']['provider'] == 'bigquery' :
|
||||
print (['_______ POSTING ______________ ',table])
|
||||
print (['_______________ ',_df.shape[0],' ___________________'])
|
||||
writer.write(_df.astype(object),schema=_schema,table=table)
|
||||
else:
|
||||
writer.write(_df,table=args['from'])
|
||||
writer.table = table
|
||||
writer.write(_df)
|
||||
# else:
|
||||
# writer.write(_df,table=args['from'])
|
||||
|
||||
|
||||
def finalize(self,args):
|
||||
"""
|
||||
|
@ -288,8 +314,9 @@ class Components :
|
|||
"""
|
||||
reader = factory.instance(**args['store']['source'])
|
||||
logger = factory.instance(**args['store']['logs'])
|
||||
target = args['store']['target']['args']['dataset']
|
||||
source = args['store']['source']['args']['dataset']
|
||||
|
||||
target = args['store']['target']['args']['dataset']
|
||||
source = args['store']['source']['args']['dataset']
|
||||
table = args['from']
|
||||
schema = reader.meta(table=args['from'])
|
||||
#
|
||||
|
@ -327,7 +354,10 @@ class Components :
|
|||
This function will generate data and store it to a given,
|
||||
"""
|
||||
store = args['store']['logs']
|
||||
store['args']['doc'] = args['context']
|
||||
if 'args' in store :
|
||||
store['args']['doc'] = args['context']
|
||||
else:
|
||||
store['doc'] = args['context']
|
||||
logger = factory.instance(**store) #type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
|
||||
|
||||
ostore = args['store']['target']
|
||||
|
@ -348,13 +378,13 @@ class Components :
|
|||
schema = reader.meta(table=args['from'])
|
||||
schema = [{"name":_item.name,"type":_item.field_type} for _item in schema]
|
||||
|
||||
|
||||
# else:
|
||||
# #
|
||||
# # This will account for autopilot mode ...
|
||||
# df = args['data']
|
||||
_cast = {}
|
||||
if schema :
|
||||
|
||||
for _item in schema :
|
||||
dtype = str
|
||||
name = _item['name']
|
||||
|
@ -405,139 +435,72 @@ class Components :
|
|||
logger.write(_info)
|
||||
if args['data'].shape[0] > 0 and args['data'].shape[1] > 0 :
|
||||
candidates = (data.maker.generate(**args))
|
||||
|
||||
else:
|
||||
candidates = [df]
|
||||
if 'sql.BQWriter' in ostore['type'] :
|
||||
#table = ".".join([ostore['['dataset'],args['context']])
|
||||
# writer = factory.instance(**ostore)
|
||||
_columns = None
|
||||
skip_columns = []
|
||||
_schema = schema
|
||||
if schema :
|
||||
cols = [_item['name'] for _item in _schema]
|
||||
else:
|
||||
cols = df.columns
|
||||
for _df in candidates :
|
||||
#
|
||||
# we need to format the fields here to make sure we have something cohesive
|
||||
#
|
||||
|
||||
# if 'sql.BQWriter' in ostore['type'] :
|
||||
_columns = None
|
||||
skip_columns = []
|
||||
_schema = schema
|
||||
if schema :
|
||||
cols = [_item['name'] for _item in _schema]
|
||||
else:
|
||||
cols = df.columns.tolist()
|
||||
_info = {"module":"gan-prep","action":"selection","input":{"candidates":len(candidates),"features":cols}}
|
||||
logger.write(_info)
|
||||
for _df in candidates :
|
||||
#
|
||||
# we need to format the fields here to make sure we have something cohesive
|
||||
#
|
||||
|
||||
if not skip_columns :
|
||||
# _columns = set(df.columns) - set(_df.columns)
|
||||
if 'ignore' in args and 'columns' in args['ignore'] :
|
||||
skip_columns = self.get_ignore(data=_df,columns=args['ignore']['columns'])
|
||||
# for name in args['ignore']['columns'] :
|
||||
# for _name in _df.columns:
|
||||
# if _name in name:
|
||||
# skip_columns.append(_name)
|
||||
#
|
||||
# We perform a series of set operations to insure that the following conditions are met:
|
||||
# - the synthetic dataset only has fields that need to be synthesized
|
||||
# - The original dataset has all the fields except those that need to be synthesized
|
||||
#
|
||||
|
||||
_df = _df[list(set(_df.columns) - set(skip_columns))].copy()
|
||||
if x_cols :
|
||||
_approx = {}
|
||||
for _col in x_cols :
|
||||
if real_df[_col].unique().size > 0 :
|
||||
|
||||
if not skip_columns :
|
||||
if 'ignore' in args and 'columns' in args['ignore'] :
|
||||
skip_columns = self.get_ignore(data=_df,columns=args['ignore']['columns'])
|
||||
#
|
||||
# We perform a series of set operations to insure that the following conditions are met:
|
||||
# - the synthetic dataset only has fields that need to be synthesized
|
||||
# - The original dataset has all the fields except those that need to be synthesized
|
||||
#
|
||||
|
||||
_df = _df[list(set(_df.columns) - set(skip_columns))].copy()
|
||||
if x_cols :
|
||||
_approx = {}
|
||||
for _col in x_cols :
|
||||
if real_df[_col].unique().size > 0 :
|
||||
|
||||
|
||||
_df[_col] = self.approximate(real_df[_col].values)
|
||||
_approx[_col] = {
|
||||
"io":{"min":_df[_col].min().astype(float),"max":_df[_col].max().astype(float),"mean":_df[_col].mean().astype(float),"sd":_df[_col].values.std().astype(float),"missing": _df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":_df[_col].where(_df[_col] == 0).dropna().count().astype(float)},
|
||||
"real":{"min":real_df[_col].min().astype(float),"max":real_df[_col].max().astype(float),"mean":real_df[_col].mean().astype(float),"sd":real_df[_col].values.std().astype(float),"missing": real_df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count().astype(float)}
|
||||
}
|
||||
else:
|
||||
_df[_col] = -1
|
||||
logger.write({"module":"gan-generate","action":"approximate","status":_approx})
|
||||
if set(df.columns) & set(_df.columns) :
|
||||
_columns = set(df.columns) - set(_df.columns)
|
||||
df = df[_columns]
|
||||
_df[_col] = self.approximate(real_df[_col].values)
|
||||
_approx[_col] = {
|
||||
"io":{"min":_df[_col].min().astype(float),"max":_df[_col].max().astype(float),"mean":_df[_col].mean().astype(float),"sd":_df[_col].values.std().astype(float),"missing": _df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":_df[_col].where(_df[_col] == 0).dropna().count().astype(float)},
|
||||
"real":{"min":real_df[_col].min().astype(float),"max":real_df[_col].max().astype(float),"mean":real_df[_col].mean().astype(float),"sd":real_df[_col].values.std().astype(float),"missing": real_df[_col].where(_df[_col] == -1).dropna().count().astype(float),"zeros":real_df[_col].where(_df[_col] == 0).dropna().count().astype(float)}
|
||||
}
|
||||
else:
|
||||
_df[_col] = -1
|
||||
logger.write({"module":"gan-generate","action":"approximate","status":_approx})
|
||||
if set(df.columns) & set(_df.columns) :
|
||||
_columns = list(set(df.columns) - set(_df.columns))
|
||||
df = df[_columns]
|
||||
|
||||
#
|
||||
# Let us merge the dataset here and and have a comprehensive dataset
|
||||
#
|
||||
# Let us merge the dataset here and and have a comprehensive dataset
|
||||
|
||||
_df = pd.DataFrame.join(df,_df)
|
||||
|
||||
# if _schema :
|
||||
# for _item in _schema :
|
||||
# if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
|
||||
# _df[_item['name']] = _df[_item['name']].astype(str)
|
||||
|
||||
# pass
|
||||
_params = {'data':_df,'store' : ostore}
|
||||
if _schema :
|
||||
_params ['schema'] = _schema
|
||||
self.post(**_params)
|
||||
# if _schema :
|
||||
# writer.write(_df[cols],schema=_schema,table=args['from'])
|
||||
# self.post(data=_df,schema=)
|
||||
# else:
|
||||
# writer.write(_df[cols],table=args['from'])
|
||||
_df = pd.DataFrame.join(df,_df)
|
||||
_params = {'data':_df,'store' : ostore}
|
||||
if _schema :
|
||||
_params ['schema'] = _schema
|
||||
_info = {"module":"gan-prep","action":"write","input":{"rows":_df.shape[0],"cols":_df.shape[1]}}
|
||||
logger.write(_info)
|
||||
self.post(**_params)
|
||||
# print (['_______ posting _________________',_df.shape])
|
||||
break
|
||||
|
||||
|
||||
pass
|
||||
# else:
|
||||
# pass
|
||||
|
||||
|
||||
# #
|
||||
# # We need to post the generate the data in order to :
|
||||
# # 1. compare immediately
|
||||
# # 2. synthetic copy
|
||||
# #
|
||||
|
||||
# cols = _dc.columns.tolist()
|
||||
|
||||
# data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query)
|
||||
# #
|
||||
# # performing basic analytics on the synthetic data generated (easy to quickly asses)
|
||||
# #
|
||||
# info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}}
|
||||
|
||||
# #
|
||||
# # @TODO: Send data over to a process for analytics
|
||||
|
||||
# base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it)
|
||||
# cols = _dc.columns.tolist()
|
||||
# for name in cols :
|
||||
# _args['data'][name] = _dc[name]
|
||||
|
||||
# #
|
||||
# #-- Let us store all of this into bigquery
|
||||
# prefix = args['notify']+'.'+_args['context']
|
||||
# partition = str(partition)
|
||||
# table = '_'.join([prefix,partition,'io']).replace('__','_')
|
||||
# folder = os.sep.join([args['logs'],args['context'],partition,'output'])
|
||||
# if 'file' in args :
|
||||
|
||||
# _fname = os.sep.join([folder,table.replace('_io','_full_io.csv')])
|
||||
# _pname = os.sep.join([folder,table])+'.csv'
|
||||
# data_comp.to_csv( _pname,index=False)
|
||||
# _args['data'].to_csv(_fname,index=False)
|
||||
|
||||
# _id = 'path'
|
||||
# else:
|
||||
|
||||
# credentials = service_account.Credentials.from_service_account_file('/home/steve/dev/aou/accounts/curation-prod.json')
|
||||
# _pname = os.sep.join([folder,table+'.csv'])
|
||||
# _fname = table.replace('_io','_full_io')
|
||||
# partial = '.'.join(['io',args['context']+'_partial_io'])
|
||||
# complete= '.'.join(['io',args['context']+'_full_io'])
|
||||
# data_comp.to_csv(_pname,index=False)
|
||||
# if 'dump' in args :
|
||||
# print (_args['data'].head())
|
||||
# else:
|
||||
# Components.lock.acquire()
|
||||
# data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000)
|
||||
# _args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000)
|
||||
# Components.lock.release()
|
||||
# _id = 'dataset'
|
||||
# info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} }
|
||||
# if partition :
|
||||
# info ['partition'] = int(partition)
|
||||
# logger.write({"module":"generate","action":"write","input":info} )
|
||||
|
||||
def bind(self,**_args):
|
||||
print (_args)
|
||||
|
||||
|
||||
if __name__ == '__main__' :
|
||||
|
@ -611,6 +574,50 @@ if __name__ == '__main__' :
|
|||
|
||||
generator = Components()
|
||||
generator.generate(args)
|
||||
elif 'bind' in SYS_ARGS :
|
||||
import binder
|
||||
_args = _config['_map']
|
||||
_args['store'] = copy.deepcopy(_config['store'])
|
||||
if 'init' in SYS_ARGS :
|
||||
#
|
||||
# Creating and persisting the map ...
|
||||
print (['.... Binding Initialization'])
|
||||
# jobs = binder.Init(**_args)
|
||||
_mapped = binder.Init(**_args)
|
||||
|
||||
|
||||
_schema = [{"name":_name,"type":"INTEGER"} for _name in _mapped.columns.tolist()]
|
||||
publisher = lambda _params: (Components()).post(**_params)
|
||||
_args = {'data':_mapped,'store':_config['store']['target']}
|
||||
_args['store']['table'] = '_map'
|
||||
if _args['store']['provider'] =='bigquery' :
|
||||
_args['schema'] = _schema
|
||||
|
||||
job = Process (target = publisher,args=(_args,))
|
||||
job.start()
|
||||
jobs = [job]
|
||||
else:
|
||||
#
|
||||
# Applying the map of k on a particular dataset
|
||||
#
|
||||
index = int(SYS_ARGS['index'])
|
||||
_args['config'] = _config['pipeline'][index]
|
||||
_args['original_key'] = 'person_id' if 'original_key' in _config else 'person_id'
|
||||
table = _config['pipeline'][index]['from']
|
||||
_df = binder.ApplyOn(**_args)
|
||||
_df = np.array_split(_df,PART_SIZE)
|
||||
jobs = []
|
||||
print (['Publishing ',PART_SIZE,' PARTITION'])
|
||||
for data in _df :
|
||||
publisher = lambda _params: ( Components() ).post(**_params)
|
||||
_args = {'data':data,'store':_config['store']['target']}
|
||||
_args['store']['table'] = table
|
||||
print (_args['store'])
|
||||
job = Process(target = publisher,args=(_args,))
|
||||
job.name = "Publisher "+str(len(jobs)+1)
|
||||
job.start()
|
||||
jobs.append(job)
|
||||
|
||||
elif 'shuffle' in SYS_ARGS :
|
||||
index = 0
|
||||
if GPU_CHIPS and 'all-chips' in SYS_ARGS:
|
||||
|
@ -632,6 +639,7 @@ if __name__ == '__main__' :
|
|||
# Let us create n-jobs across n-gpus, The assumption here is the data that is produced will be a partition
|
||||
# @TODO: Find better name for partition
|
||||
#
|
||||
|
||||
if GPU_CHIPS and 'all-chips' in SYS_ARGS:
|
||||
index = 0
|
||||
print (['... launching ',len(GPU_CHIPS),' jobs',args['context']])
|
||||
|
@ -652,12 +660,15 @@ if __name__ == '__main__' :
|
|||
else:
|
||||
#
|
||||
# The choice of the chip will be made internally
|
||||
|
||||
agent = Components()
|
||||
agent.train(**args)
|
||||
#
|
||||
# If we have any obs we should wait till they finish
|
||||
#
|
||||
DIRTY = 0
|
||||
if (len(jobs)) :
|
||||
print (['.... waiting on ',len(jobs),' jobs'])
|
||||
while len(jobs)> 0 :
|
||||
DIRTY =1
|
||||
jobs = [job for job in jobs if job.is_alive()]
|
||||
|
@ -666,47 +677,16 @@ if __name__ == '__main__' :
|
|||
print (["..:: jobs finished "])
|
||||
#
|
||||
# We need to harmonize the keys if any at all in this case we do this for shuffle or generate operations
|
||||
#
|
||||
|
||||
if 'autopilot' in SYS_ARGS or 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) :
|
||||
#
|
||||
# We should pull all the primary keys and regenerate them in order to insure some form of consistency
|
||||
#
|
||||
print (["..:: Finalizing process"])
|
||||
(Components()).finalize(args)
|
||||
# finalize(args)
|
||||
pass
|
||||
# jobs = []
|
||||
# for index in range(0,PART_SIZE) :
|
||||
# if 'focus' in args and int(args['focus']) != index :
|
||||
# continue
|
||||
# args['part_size'] = PART_SIZE
|
||||
# args['partition'] = index
|
||||
# args['data'] = DATA[index]
|
||||
# if int(args['num_gpu']) > 1 :
|
||||
# args['gpu'] = index
|
||||
# else:
|
||||
# args['gpu']=0
|
||||
# This holds true for bigquery - bigquery only
|
||||
IS_BIGQUERY = _config['store']['source']['provider'] == _config['store']['target']['provider'] and _config['store']['source']['provider'] == 'bigquery'
|
||||
|
||||
# make = lambda _args: (Components()).train(**_args)
|
||||
# job = Process(target=make,args=( dict(args),))
|
||||
# job.name = 'Trainer # ' + str(index)
|
||||
# job.start()
|
||||
# jobs.append(job)
|
||||
# # args['gpu']
|
||||
# print (["Started ",len(jobs),"trainers" if len(jobs)>1 else "trainer" ])
|
||||
# while len(jobs)> 0 :
|
||||
# jobs = [job for job in jobs if job.is_alive()]
|
||||
# time.sleep(2)
|
||||
# if 'bind' not in SYS_ARGS and IS_BIGQUERY and ('autopilot' in SYS_ARGS or 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS)) :
|
||||
# #
|
||||
# # We should pull all the primary keys and regenerate them in order to insure some form of consistency
|
||||
# #
|
||||
|
||||
# trainer = Components()
|
||||
# trainer.train(**args)
|
||||
# #
|
||||
# #
|
||||
|
||||
|
||||
# Components.train(**args)
|
||||
#for args in PIPELINE :
|
||||
#args['dataset'] = 'combined20190510'
|
||||
#process = Process(target=Components.train,args=(args,))
|
||||
#process.name = args['context']
|
||||
#process.start()
|
||||
# Components.train(args)
|
||||
# print (["..:: Finalizing process"])
|
||||
# (Components()).finalize(args)
|
||||
|
|
Loading…
Reference in New Issue