bug fix: multiple conditions on statement

This commit is contained in:
Steve Nyemba 2020-04-02 00:04:05 -05:00
parent 87d54c508d
commit 6d84b25d95
2 changed files with 38 additions and 52 deletions

View File

@ -536,10 +536,10 @@ class Predict(GNet):
self.values = args['values'] self.values = args['values']
self.ROW_COUNT = args['row_count'] self.ROW_COUNT = args['row_count']
self.oROW_COUNT = self.ROW_COUNT self.oROW_COUNT = self.ROW_COUNT
if args['no_value'] in ['na','','NA'] :
self.MISSING_VALUES = np.nan self.MISSING_VALUES = np.nan
else : if 'no_value' in args and args['no_value'] not in ['na','','NA'] :
self.MISSING_VALUES = args['no_value'] self.MISSING_VALUES = args['no_value']
# self.MISSING_VALUES = args['no_value'] # self.MISSING_VALUES = args['no_value']
# self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value'] # self.MISSING_VALUES = int(args['no_value']) if args['no_value'].isnumeric() else np.na if args['no_value'] in ['na','NA','N/A'] else args['no_value']
def load_meta(self, column): def load_meta(self, column):

View File

@ -20,7 +20,12 @@ class Components :
class KEYS : class KEYS :
PIPELINE_KEY = 'pipeline' PIPELINE_KEY = 'pipeline'
SQL_FILTER = 'filter' SQL_FILTER = 'filter'
@staticmethod
def get_filter (**args):
if args['qualifier'] == 'IN' :
return ' '.join([args['field'],args['qualifier'],'(',args['value'],')'])
else:
return ' '.join([args['field'],args['qualifier'],args['value']])
@staticmethod @staticmethod
def get_logger(**args) : def get_logger(**args) :
return factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) return factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
@ -34,8 +39,11 @@ class Components :
""" """
SQL = args['sql'] SQL = args['sql']
if Components.KEYS.SQL_FILTER in args : if Components.KEYS.SQL_FILTER in args :
SQL_FILTER = Components.KEYS.SQL_FILTER FILTER_KEY = Components.KEYS.SQL_FILTER
condition = ' '.join([args[SQL_FILTER]['field'],args[SQL_FILTER]['qualifier'],'(',args[SQL_FILTER]['value'],')']) SQL_FILTER = args[FILTER_KEY] if type(args[FILTER_KEY]) == list else [args[FILTER_KEY]]
# condition = ' '.join([args[FILTER_KEY]['field'],args[FILTER_KEY]['qualifier'],'(',args[FILTER_KEY]['value'],')'])
condition = ' AND '.join([Components.get_filter(**item) for item in SQL_FILTER])
SQL = " ".join([SQL,'WHERE',condition]) SQL = " ".join([SQL,'WHERE',condition])
SQL = SQL.replace(':dataset',args['dataset']) #+ " LI " SQL = SQL.replace(':dataset',args['dataset']) #+ " LI "
@ -76,13 +84,6 @@ class Components :
logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']}) logger = factory.instance(type='mongo.MongoWriter',args={'dbname':'aou','doc':args['context']})
log_folder = args['logs'] if 'logs' in args else 'logs' log_folder = args['logs'] if 'logs' in args else 'logs'
# _args = {"batch_size":10000,"logs":log_folder,"context":args['context'],"max_epochs":150,"column":args['columns'],"id":"person_id","logger":logger}
# _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
# _args['num_gpu'] = int(args['num_gpu']) if 'num_gpu' in args else 1
# _args['gpu'] = args['gpu'] if 'gpu' in args else 0
# # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0
PART_SIZE = int(args['part_size']) PART_SIZE = int(args['part_size'])
partition = args['partition'] partition = args['partition']
@ -156,16 +157,22 @@ class Components :
# columns = args['columns'] # columns = args['columns']
# df = np.array_split(df[columns].values,PART_SIZE) # df = np.array_split(df[columns].values,PART_SIZE)
# df = pd.DataFrame(df[ int (partition) ],columns = columns) # df = pd.DataFrame(df[ int (partition) ],columns = columns)
info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"part_size":int(PART_SIZE)} # max_rows = int(args['partition_max_rows']) if 'partition_max_rows' in args else 1000000
# N = np.divide(df.shape[0],max_rows).astype(int) + 1
info = {"parition":int(partition),"gpu":_args["gpu"],"rows":int(df.shape[0]),"cols":int(df.shape[1]),"part_size":int(PART_SIZE),"partition-info":{"count":int(N),"max_rows":max_rows}}
logger.write({"module":"generate","action":"partition","input":info}) logger.write({"module":"generate","action":"partition","input":info})
_args['partition'] = int(partition) _args['partition'] = int(partition)
_args['continuous']= args['continuous'] if 'continuous' in args else [] _args['continuous']= args['continuous'] if 'continuous' in args else []
_args['data'] = df #
# _args['data'] = reader() # How many rows sub-partition must we divide this into ?
#_args['data'] = _args['data'].astype(object) # -- Let us try assessing
# _args['num_gpu'] = 1
_dc = data.maker.generate(**_args)
df = np.array_split(df,N)
_dc = pd.DataFrame()
# for mdf in df :
_args['data'] = df
_dc = _dc.append(data.maker.generate(**_args))
# #
# We need to post the generate the data in order to : # We need to post the generate the data in order to :
# 1. compare immediately # 1. compare immediately
@ -180,35 +187,13 @@ class Components :
# #
info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}} info = {"module":"generate","action":"io.metrics","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}}
x = {} x = {}
# for name in args['columns'] : #
# ident = data_comp.apply(lambda row: 1*(row[name]==row[name+'_io']),axis=1).sum() # @TODO: Send data over to a process for analytics
# count = data_comp[name].unique().size
# _ident= data_comp.shape[1] - ident
# _count= data_comp[name+'_io'].unique().size
# _count= len(set(data_comp[name+'_io'].values.tolist()))
# info['input']['logs'] += [{"name":name,"identical":int(ident),"no_identical":int(_ident),"original_count":count,"synthetic_count":_count}]
# for name in data_comp.columns.tolist() :
# g = pd.DataFrame(data_comp.groupby([name]).size())
# g.columns = ['counts']
# g[name] = g.index.tolist()
# g.index = np.arange(g.shape[0])
# logs.append({"name":name,"counts": g.to_dict(orient='records')})
# info['input']['logs'] = logs
# logger.write(info)
base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it) base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it)
cols = _dc.columns.tolist() cols = _dc.columns.tolist()
for name in cols : for name in cols :
_args['data'][name] = _dc[name] _args['data'][name] = _dc[name]
# info = {"module":"generate","action":"io","input":{"rows":_dc[name].shape[0],"name":name}}
# if partition != '' :
# info['partition'] = int(partition)
# logger.write(info)
# filename = os.sep.join([log_folder,'output',name+'.csv'])
# data_comp[[name]].to_csv(filename,index=False)
# #
#-- Let us store all of this into bigquery #-- Let us store all of this into bigquery
@ -265,7 +250,7 @@ if __name__ == '__main__' :
f = [i for i in range(0,N) if PIPELINE[i]['context'] == index] f = [i for i in range(0,N) if PIPELINE[i]['context'] == index]
index = f[0] if f else 0 index = f[0] if f else 0
# #
# print
print ("..::: ",PIPELINE[index]['context']) print ("..::: ",PIPELINE[index]['context'])
args = (PIPELINE[index]) args = (PIPELINE[index])
for key in _config : for key in _config :
@ -274,8 +259,8 @@ if __name__ == '__main__' :
# skip in case of pipeline or if key exists in the selected pipeline (provided by index) # skip in case of pipeline or if key exists in the selected pipeline (provided by index)
# #
continue continue
args[key] = _config[key] args[key] = _config[key]
args = dict(args,**SYS_ARGS) args = dict(args,**SYS_ARGS)
if 'batch_size' not in args : if 'batch_size' not in args :
args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size']) args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size'])
@ -286,7 +271,7 @@ if __name__ == '__main__' :
# @TODO: # @TODO:
# Log what was initiated so we have context of this processing ... # Log what was initiated so we have context of this processing ...
# #
if 'listen' not in SYS_ARGS : # if 'listen' not in SYS_ARGS :
if 'file' in args : if 'file' in args :
DATA = pd.read_csv(args['file']) ; DATA = pd.read_csv(args['file']) ;
else: else:
@ -325,6 +310,7 @@ if __name__ == '__main__' :
args['gpu'] = index args['gpu'] = index
else: else:
args['gpu']=0 args['gpu']=0
make = lambda _args: (Components()).generate(_args) make = lambda _args: (Components()).generate(_args)
job = Process(target=make,args=(args,)) job = Process(target=make,args=(args,))
job.name = 'generator # '+str(index) job.name = 'generator # '+str(index)