Refactored, including population risk assessment
This commit is contained in:
parent
6863df382e
commit
c3066408c9
|
@ -22,16 +22,108 @@
|
|||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
import time
|
||||
@pd.api.extensions.register_dataframe_accessor("deid")
|
||||
class deid :
|
||||
"""
|
||||
This class is a deidentification class that will compute risk (marketer, prosecutor) given a pandas dataframe
|
||||
"""
|
||||
def __init__(self,df):
|
||||
self._df = df
|
||||
self._df = df.fillna(' ')
|
||||
def explore(self, **args):
    """
    Evaluate the risk of randomly generated policies (combinations of attributes).

    Each run draws a random subset of the candidate columns, scores it with
    self.evaluate and records which columns made up the policy.

    @param pop          optional data-frame with the population reference
    @param sample       optional data-frame to assess (default: the frame held by this accessor)
    @param num_runs     number of random policies to generate (default 5)
    @param field_count  exclusive upper bound on the number of fields drawn per policy
                        (default: number of sample columns minus one)
    @param id           optional key field that uniquely identifies a patient/customer;
                        when provided it is never drawn into a policy
    @return data-frame with one row per run: the columns returned by self.evaluate
            followed by one 0/1 flag per sample column (1 = column is part of the policy).
            An empty data-frame is returned when no run is performed.
    """
    pop = args.get('pop')
    runs = args['num_runs'] if 'num_runs' in args else 5
    sample = args['sample'] if 'sample' in args else pd.DataFrame(self._df)
    k = int(args['field_count']) if 'field_count' in args else sample.columns.size - 1

    #
    # The key field identifies a record, it is not a quasi-identifier, so it is
    # removed from the candidates (the builtin `id` must not be used here).
    #
    key = args.get('id')
    columns = [name for name in sample.columns.tolist() if key is None or name != key]
    if not columns:
        raise ValueError("explore requires at least one candidate column")

    #
    # A policy can never hold more fields than there are candidates, and
    # np.random.randint requires low < high: small frames fall back to a fixed size.
    #
    upper = min(k, len(columns) + 1)
    rows = []
    for _ in np.arange(runs):
        n = np.random.randint(2, upper) if upper > 2 else min(2, len(columns))
        cols = np.random.choice(columns, n, replace=False).tolist()
        params = {'sample': sample, 'cols': cols}
        if pop is not None:
            params['pop'] = pop
        r = self.evaluate(**params)
        #
        # let's put the policy in place
        #
        p = pd.DataFrame(1 * sample.columns.isin(cols)).T
        p.columns = sample.columns
        rows.append(r.join(p))

    if not rows:
        return pd.DataFrame()
    # DataFrame.append no longer exists in recent pandas; concatenate once instead.
    o = pd.concat(rows)
    o.index = np.arange(o.shape[0]).astype(np.int64)
    return o
|
||||
def evaluate(self, **args):
    """
    Compute the marketer and prosecutor risk of a policy on a sample. If a population
    is provided, the marketer risk is evaluated relative to both the population and the sample.

    @param sample   data-frame with the data to be processed (default: the frame held by this accessor)
    @param cols     the columns (policy) to be considered (default: all columns of the sample)
    @param pop      optional population data-frame; must contain the columns of the policy
    @param flag     user defined flag, copied to the 'tier' column (no computation use)
    @return one-row data-frame.
            Without a population: marketer, prosecutor, field_count, group_count.
            With a population: 'sample %', marketer, 'sample marketer', tier
            (empty if no sample group is found in the population).
    Raises ValueError (from np.min) when the sample yields no group at all.
    """
    x_i = args['sample'] if 'sample' in args else pd.DataFrame(self._df)
    cols = args['cols'] if 'cols' in args else x_i.columns.tolist()
    flag = args['flag'] if 'flag' in args else 'UNFLAGGED'
    pop = args.get('pop')

    #
    # Size of every equivalence class. The keys are kept in the index so that
    # .size() is a Series of counts on every pandas version (as_index=False
    # yields a DataFrame on recent versions and would mix keys with counts).
    #
    x_sizes = x_i.groupby(cols).size()
    x_i_values = x_sizes.values
    SAMPLE_GROUP_COUNT = x_i_values.size
    SAMPLE_FIELD_COUNT = len(cols)
    SAMPLE_POPULATION = x_i_values.sum()

    SAMPLE_MARKETER = SAMPLE_GROUP_COUNT / np.float64(SAMPLE_POPULATION)
    SAMPLE_PROSECUTOR = 1 / np.min(x_i_values).astype(np.float64)

    if pop is None:
        return pd.DataFrame({"marketer": [SAMPLE_MARKETER], "prosecutor": [SAMPLE_PROSECUTOR], "field_count": [SAMPLE_FIELD_COUNT], "group_count": [SAMPLE_GROUP_COUNT]})

    y_g = pop.groupby(cols).size().reset_index(name="group_size")
    x_g = x_sizes.reset_index(name="group_size")

    # Share of the population covered by the sample, counted in rows (not in cells).
    pop_rows = pop.shape[0]
    SAMPLE_RATIO = int(100.0 * x_i.shape[0] / pop_rows) if pop_rows else 0

    # Only the groups present on both sides contribute to the population marketer risk.
    r = pd.merge(x_g, y_g, on=cols, how='inner')
    r['marketer'] = (r['group_size_x'] / r['group_size_y'].astype(np.float64)) / np.sum(x_g['group_size'])
    r['sample %'] = np.repeat(SAMPLE_RATIO, r.shape[0])
    r['tier'] = np.repeat(flag, r.shape[0])
    r['sample marketer'] = np.repeat(SAMPLE_MARKETER, r.shape[0])
    # Sum the per-group contributions; restricting to 'marketer' keeps the policy
    # columns (possibly non numeric) out of the aggregation.
    r = r.groupby(['sample %', 'tier', 'sample marketer'], as_index=False)['marketer'].sum()
    return r[['sample %', 'marketer', 'sample marketer', 'tier']]
|
||||
|
||||
def _risk(self,**args):
|
||||
"""
|
||||
@param id name of patient field
|
||||
@params num_runs number of runs (default will be 100)
|
||||
|
@ -50,7 +142,7 @@ class deid :
|
|||
k = len(columns)
|
||||
N = self._df.shape[0]
|
||||
tmp = self._df.fillna(' ')
|
||||
np.random.seed(1)
|
||||
np.random.seed(int(time.time()) )
|
||||
for i in range(0,num_runs) :
|
||||
|
||||
#
|
||||
|
@ -85,6 +177,7 @@ class deid :
|
|||
[
|
||||
{
|
||||
"group_count":x_.size,
|
||||
|
||||
"patient_count":N,
|
||||
"field_count":n,
|
||||
"marketer": x_.size / np.float64(np.sum(x_)),
|
||||
|
|
231
src/risk.py
231
src/risk.py
|
@ -146,7 +146,7 @@ class utils :
|
|||
|
||||
return " ".join(SQL).replace(":fields"," , ".join(fields))
|
||||
|
||||
class risk :
|
||||
class SQLRisk :
|
||||
"""
|
||||
This class will handle the creation of an SQL query that computes marketer and prosecutor risk (for now)
|
||||
"""
|
||||
|
@ -186,102 +186,163 @@ class risk :
|
|||
|
||||
|
||||
|
||||
class UtilHandler :
|
||||
def __init__(self, **args):
    """
    Build the client and load the tables of a dataset, with the key table placed first.

    @param path         path to the service account file
    @param dataset      input dataset name
    @param key_field    key_field (e.g person_id)
    @param key_table    name of the table that must come first in self.tables
    @param filter       optional collection of table names to keep (applied last)

    Raises IndexError when no loaded table is named key_table.
    """
    self.path = args['path']
    self.client = bq.Client.from_service_account_json(self.path)
    dataset = args['dataset']
    self.key = args['key_field']

    self.mytools = utils(client=self.client)
    self.tables = self.mytools.get_tables(dataset=dataset, client=self.client, key=self.key)

    # Position of the first table carrying the requested name.
    positions = [pos for pos, item in enumerate(self.tables) if item['name'] == args['key_table']]
    index = positions[0]
    if index != 0:
        # Swap the key table with whatever currently sits in front.
        self.tables[0], self.tables[index] = self.tables[index], self.tables[0]

    if 'filter' in args:
        wanted = args['filter']
        self.tables = [item for item in self.tables if item['name'] in wanted]
|
||||
|
||||
|
||||
if 'action' in SYS_ARGS and SYS_ARGS['action'] in ['create','compute','migrate'] :
|
||||
def create_table(self,**args):
|
||||
"""
|
||||
@param path absolute filename to save the create statement
|
||||
|
||||
path = SYS_ARGS['path']
|
||||
client = bq.Client.from_service_account_json(path)
|
||||
i_dataset = SYS_ARGS['i_dataset']
|
||||
key = SYS_ARGS['key']
|
||||
|
||||
mytools = utils(client = client)
|
||||
tables = mytools.get_tables(dataset=i_dataset,client=client,key=key)
|
||||
# print len(tables)
|
||||
# tables = tables[:6]
|
||||
|
||||
if SYS_ARGS['action'] == 'create' :
|
||||
#usage:
|
||||
# create --i_dataset <in dataset> --key <patient id> --o_dataset <out dataset> --table <table|file> [--file] --path <bq JSON account file>
|
||||
#
|
||||
create_sql = mytools.get_sql(tables=tables,key=key) #-- The create statement
|
||||
o_dataset = SYS_ARGS['o_dataset']
|
||||
table = SYS_ARGS['table']
|
||||
if 'file' in SYS_ARGS :
|
||||
f = open(table+'.sql','w')
|
||||
"""
|
||||
create_sql = self.mytools.get_sql(tables=self.tables,key=self.key) #-- The create statement
|
||||
# o_dataset = SYS_ARGS['o_dataset']
|
||||
# table = SYS_ARGS['table']
|
||||
if 'path' in args:
|
||||
f = open(args['path'],'w')
|
||||
f.write(create_sql)
|
||||
f.close()
|
||||
else:
|
||||
job = bq.QueryJobConfig()
|
||||
job.destination = client.dataset(o_dataset).table(table)
|
||||
job.use_query_cache = True
|
||||
job.allow_large_results = True
|
||||
job.priority = 'BATCH'
|
||||
job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
|
||||
return create_sql
|
||||
def migrate_tables(self,**args):
|
||||
"""
|
||||
This function will migrate a table from one location to another
|
||||
The reason for migration is to be able to reduce a candidate table to only represent a patient by her quasi-identifiers.
|
||||
@param dataset target dataset
|
||||
"""
|
||||
o_dataset = args['dataset'] if 'dataset' in args else None
|
||||
p = []
|
||||
for table in self.tables:
|
||||
sql = " ".join(["SELECT ",",".join(table['fields']) ," FROM (",self.mytools.get_filtered_table(table,self.key),") as ",table['name']])
|
||||
p.append(sql)
|
||||
if o_dataset :
|
||||
job = bq.QueryJobConfig()
|
||||
job.destination = self.client.dataset(o_dataset).table(table['name'])
|
||||
job.use_query_cache = True
|
||||
job.allow_large_results = True
|
||||
job.priority = 'INTERACTIVE'
|
||||
job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
|
||||
|
||||
r = client.query(create_sql,location='US',job_config=job)
|
||||
r = self.client.query(sql,location='US',job_config=job)
|
||||
|
||||
print [r.job_id,' ** ',r.state]
|
||||
elif SYS_ARGS['action'] == 'migrate' :
|
||||
#
|
||||
#
|
||||
print [table['full_name'],' ** ',r.job_id,' ** ',r.state]
|
||||
return p
|
||||
|
||||
o_dataset = SYS_ARGS['o_dataset']
|
||||
for table in tables:
|
||||
sql = " ".join(["SELECT ",",".join(table['fields']) ," FROM (",mytools.get_filtered_table(table,key),") as ",table['name']])
|
||||
print ""
|
||||
print sql
|
||||
print ""
|
||||
# job = bq.QueryJobConfig()
|
||||
# job.destination = client.dataset(o_dataset).table(table['name'])
|
||||
# job.use_query_cache = True
|
||||
# job.allow_large_results = True
|
||||
# job.priority = 'INTERACTIVE'
|
||||
# job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
|
||||
# if 'action' in SYS_ARGS and SYS_ARGS['action'] in ['create','compute','migrate'] :
|
||||
|
||||
# r = client.query(sql,location='US',job_config=job)
|
||||
# path = SYS_ARGS['path']
|
||||
# client = bq.Client.from_service_account_json(path)
|
||||
# i_dataset = SYS_ARGS['i_dataset']
|
||||
# key = SYS_ARGS['key']
|
||||
|
||||
# print [table['full_name'],' ** ',r.job_id,' ** ',r.state]
|
||||
# mytools = utils(client = client)
|
||||
# tables = mytools.get_tables(dataset=i_dataset,client=client,key=key)
|
||||
# # print len(tables)
|
||||
# # tables = tables[:6]
|
||||
|
||||
# if SYS_ARGS['action'] == 'create' :
|
||||
# #usage:
|
||||
# # create --i_dataset <in dataset> --key <patient id> --o_dataset <out dataset> --table <table|file> [--file] --path <bq JSON account file>
|
||||
# #
|
||||
# create_sql = mytools.get_sql(tables=tables,key=key) #-- The create statement
|
||||
# o_dataset = SYS_ARGS['o_dataset']
|
||||
# table = SYS_ARGS['table']
|
||||
# if 'file' in SYS_ARGS :
|
||||
# f = open(table+'.sql','w')
|
||||
# f.write(create_sql)
|
||||
# f.close()
|
||||
# else:
|
||||
# job = bq.QueryJobConfig()
|
||||
# job.destination = client.dataset(o_dataset).table(table)
|
||||
# job.use_query_cache = True
|
||||
# job.allow_large_results = True
|
||||
# job.priority = 'BATCH'
|
||||
# job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
|
||||
|
||||
# r = client.query(create_sql,location='US',job_config=job)
|
||||
|
||||
# print [r.job_id,' ** ',r.state]
|
||||
# elif SYS_ARGS['action'] == 'migrate' :
|
||||
# #
|
||||
# #
|
||||
|
||||
# o_dataset = SYS_ARGS['o_dataset']
|
||||
# for table in tables:
|
||||
# sql = " ".join(["SELECT ",",".join(table['fields']) ," FROM (",mytools.get_filtered_table(table,key),") as ",table['name']])
|
||||
# print ""
|
||||
# print sql
|
||||
# print ""
|
||||
# # job = bq.QueryJobConfig()
|
||||
# # job.destination = client.dataset(o_dataset).table(table['name'])
|
||||
# # job.use_query_cache = True
|
||||
# # job.allow_large_results = True
|
||||
# # job.priority = 'INTERACTIVE'
|
||||
# # job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
|
||||
|
||||
# # r = client.query(sql,location='US',job_config=job)
|
||||
|
||||
# # print [table['full_name'],' ** ',r.job_id,' ** ',r.state]
|
||||
|
||||
|
||||
pass
|
||||
else:
|
||||
#
|
||||
#
|
||||
tables = [tab for tab in tables if tab['name'] == SYS_ARGS['table'] ]
|
||||
limit = int(SYS_ARGS['limit']) if 'limit' in SYS_ARGS else 1
|
||||
if tables :
|
||||
risk= risk()
|
||||
df = pd.DataFrame()
|
||||
dfs = pd.DataFrame()
|
||||
np.random.seed(1)
|
||||
for i in range(0,limit) :
|
||||
r = risk.get_sql(key=SYS_ARGS['key'],table=tables[0])
|
||||
sql = r['sql']
|
||||
dfs = dfs.append(r['stream'],sort=True)
|
||||
df = df.append(pd.read_gbq(query=sql,private_key=path,dialect='standard').join(dfs))
|
||||
# df = df.join(dfs,sort=True)
|
||||
df.to_csv(SYS_ARGS['table']+'.csv')
|
||||
# dfs.to_csv(SYS_ARGS['table']+'_stream.csv')
|
||||
print [i,' ** ',df.shape[0],pd.DataFrame(r['stream']).shape]
|
||||
time.sleep(2)
|
||||
# pass
|
||||
# else:
|
||||
# #
|
||||
# #
|
||||
# tables = [tab for tab in tables if tab['name'] == SYS_ARGS['table'] ]
|
||||
# limit = int(SYS_ARGS['limit']) if 'limit' in SYS_ARGS else 1
|
||||
# if tables :
|
||||
# risk= risk()
|
||||
# df = pd.DataFrame()
|
||||
# dfs = pd.DataFrame()
|
||||
# np.random.seed(1)
|
||||
# for i in range(0,limit) :
|
||||
# r = risk.get_sql(key=SYS_ARGS['key'],table=tables[0])
|
||||
# sql = r['sql']
|
||||
# dfs = dfs.append(r['stream'],sort=True)
|
||||
# df = df.append(pd.read_gbq(query=sql,private_key=path,dialect='standard').join(dfs))
|
||||
# # df = df.join(dfs,sort=True)
|
||||
# df.to_csv(SYS_ARGS['table']+'.csv')
|
||||
# # dfs.to_csv(SYS_ARGS['table']+'_stream.csv')
|
||||
# print [i,' ** ',df.shape[0],pd.DataFrame(r['stream']).shape]
|
||||
# time.sleep(2)
|
||||
|
||||
|
||||
else:
|
||||
print 'ERROR'
|
||||
pass
|
||||
# else:
|
||||
# print 'ERROR'
|
||||
# pass
|
||||
|
||||
# r = risk(path='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json', i_dataset='raw',o_dataset='risk_o',o_table='mo')
|
||||
# tables = r.get_tables('raw','person_id')
|
||||
# sql = r.get_sql(tables=tables[:3],key='person_id')
|
||||
# #
|
||||
# # let's post this to a designated location
|
||||
# #
|
||||
# f = open('foo.sql','w')
|
||||
# f.write(sql)
|
||||
# f.close()
|
||||
# r.get_sql(tables=tables,key='person_id')
|
||||
# p = r.compute()
|
||||
# print p
|
||||
# p.to_csv("risk.csv")
|
||||
# r.write('foo.sql')
|
||||
# # r = risk(path='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json', i_dataset='raw',o_dataset='risk_o',o_table='mo')
|
||||
# # tables = r.get_tables('raw','person_id')
|
||||
# # sql = r.get_sql(tables=tables[:3],key='person_id')
|
||||
# # #
|
||||
# # # let's post this to a designated location
|
||||
# # #
|
||||
# # f = open('foo.sql','w')
|
||||
# # f.write(sql)
|
||||
# # f.close()
|
||||
# # r.get_sql(tables=tables,key='person_id')
|
||||
# # p = r.compute()
|
||||
# # print p
|
||||
# # p.to_csv("risk.csv")
|
||||
# # r.write('foo.sql')
|
||||
|
|
Loading…
Reference in New Issue