bug fix: statistics for quick assessment
This commit is contained in:
parent
af6ab356d8
commit
2f6f43c9c6
20
pipeline.py
20
pipeline.py
|
@ -163,6 +163,21 @@ class Components :
|
||||||
cols = _dc.columns.tolist()
|
cols = _dc.columns.tolist()
|
||||||
|
|
||||||
data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query)
|
data_comp = _args['data'][args['columns']].join(_dc[args['columns']],rsuffix='_io') #-- will be used for comparison (store this in big query)
|
||||||
|
#
|
||||||
|
# performing basic analytics on the synthetic data generated (easy to quickly assess)
|
||||||
|
#
|
||||||
|
info = {"module":"generate","action":"io-stats","input":{"rows":data_comp.shape[0],"partition":partition,"logs":[]}}
|
||||||
|
logs = []
|
||||||
|
for name in data_comp.columns.tolist() :
|
||||||
|
g = pd.DataFrame(data_comp.groupby([name]).size())
|
||||||
|
g.columns = ['counts']
|
||||||
|
g[name] = g.index.tolist()
|
||||||
|
g.index = np.arange(g.shape[0])
|
||||||
|
logs.append({"name":name,"counts": g.to_dict(orient='records')})
|
||||||
|
info['input']['logs'] = logs
|
||||||
|
logger.write(info)
|
||||||
|
|
||||||
|
|
||||||
base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it)
|
base_cols = list(set(_args['data'].columns) - set(args['columns'])) #-- rebuilt the dataset (and store it)
|
||||||
for name in cols :
|
for name in cols :
|
||||||
_args['data'][name] = _dc[name]
|
_args['data'][name] = _dc[name]
|
||||||
|
@ -170,6 +185,7 @@ class Components :
|
||||||
if partition != '' :
|
if partition != '' :
|
||||||
info['partition'] = int(partition)
|
info['partition'] = int(partition)
|
||||||
logger.write(info)
|
logger.write(info)
|
||||||
|
|
||||||
# filename = os.sep.join([log_folder,'output',name+'.csv'])
|
# filename = os.sep.join([log_folder,'output',name+'.csv'])
|
||||||
# data_comp[[name]].to_csv(filename,index=False)
|
# data_comp[[name]].to_csv(filename,index=False)
|
||||||
|
|
||||||
|
@ -197,10 +213,10 @@ class Components :
|
||||||
if 'dump' in args :
|
if 'dump' in args :
|
||||||
print (_args['data'].head())
|
print (_args['data'].head())
|
||||||
else:
|
else:
|
||||||
data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=50000)
|
data_comp.to_gbq(if_exists='append',destination_table=partial,credentials=credentials,chunksize=90000)
|
||||||
|
|
||||||
INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append'
|
INSERT_FLAG = 'replace' if 'partition' not in args or 'segment' not in args else 'append'
|
||||||
_args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=50000)
|
_args['data'].to_gbq(if_exists='append',destination_table=complete,credentials=credentials,chunksize=90000)
|
||||||
_id = 'dataset'
|
_id = 'dataset'
|
||||||
info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} }
|
info = {"full":{_id:_fname,"rows":_args['data'].shape[0]},"partial":{"path":_pname,"rows":data_comp.shape[0]} }
|
||||||
if partition :
|
if partition :
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -4,7 +4,7 @@ import sys
|
||||||
|
|
||||||
def read(fname):
|
def read(fname):
|
||||||
return open(os.path.join(os.path.dirname(__file__), fname)).read()
|
return open(os.path.join(os.path.dirname(__file__), fname)).read()
|
||||||
args = {"name":"data-maker","version":"1.2.3","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
|
args = {"name":"data-maker","version":"1.2.4","author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
|
||||||
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
|
"packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
|
||||||
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
|
args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']
|
||||||
args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'
|
args['url'] = 'https://hiplab.mc.vanderbilt.edu/git/aou/data-maker.git'
|
||||||
|
|
Loading…
Reference in New Issue