7.7 KiB
7.7 KiB
None
<html lang="en">
<head>
</head>
</html>
In [1]:
# --- Imports -----------------------------------------------------------------
# Standard library
import itertools
import os
from time import time

# Third-party
import numpy as np
import pandas as pd

# from pandas_risk import *

# --- Configuration -----------------------------------------------------------
# Individual column names of interest. Nothing in the cells visible here reads
# this list; it is kept as part of the notebook's configuration.
attr = [
    'gender',
    'race',
    'zip',
    'year_of_birth',
]

# Column combinations evaluated by the experiment; each inner list is passed
# as the `cols` argument of risk().
comb_attr = [
    ['zip', 'gender', 'birth_datetime', 'race'],
    ['zip', 'gender', 'year_of_birth', 'race'],
    ['gender', 'race', 'zip'],
    ['race', 'year_of_birth', 'zip'],
]
In [2]:
# Load the controlled dataset from BigQuery into `dfc`.
SQL_CONTROLLED = "SELECT * FROM deid_risk.basic_risk60k"

# Path to the service-account key file. It can be overridden with the
# DEID_RISK_PRIVATE_KEY environment variable so the notebook is not tied to one
# user's home directory; the original location remains the default.
PRIVATE_KEY_PATH = os.environ.get(
    'DEID_RISK_PRIVATE_KEY',
    '/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',
)

# NOTE(review): newer pandas releases replaced the `private_key` keyword with
# `credentials` -- confirm against the installed pandas / pandas-gbq version.
dfc = pd.read_gbq(SQL_CONTROLLED, private_key=PRIVATE_KEY_PATH)
In [3]:
def risk(**args):
    """Compute a marketer-risk figure on repeated random samples of a dataset.

    For every sampling percentage in ``prop`` the dataset is sampled
    ``num_runs`` times without replacement. For each sample the rows are
    grouped by the quasi-identifier columns, the sample group sizes are matched
    against the full-dataset group sizes, and
    ``(sample group size / full group size) / sample size`` is summed over all
    groups to give the ``marketer`` value of that run.

    Keyword arguments:
        data:     DataFrame to analyse; missing values are replaced with ' '
                  before grouping.
        num_runs: number of samples drawn per percentage (required).
        prop:     iterable of sampling percentages; defaults to
                  ``np.arange(5, 100, 5)``.
        flag:     label copied into the ``tier`` column; defaults to
                  ``'UNFLAGGED'``.
        cols:     quasi-identifier columns; defaults to every column of
                  ``data`` except ``'person_id'``.
        seed:     optional seed for a private ``RandomState``. When omitted the
                  global ``np.random`` state is used, as before.

    Returns:
        DataFrame with the columns ``'sample %'``, ``'marketer'``,
        ``'sample marketer'`` and ``'tier'``, one row per run whose sample
        matched at least one group, indexed 0..n-1. An empty DataFrame is
        returned when no run produced a row.

    Raises:
        KeyError: if ``data`` or ``num_runs`` is not supplied.

    Note:
        Each sample is also passed to the ``deid`` DataFrame accessor
        (``sample.deid.risk(id='person_id', quasi_id=columns)``), which must be
        registered before this function is called. Presumably
        ``pandas_risk`` provides it -- verify. Its result is only read as
        ``result['marketer'].values[0]``.
    """
    data = args['data'].fillna(' ')
    sizes = args['prop'] if 'prop' in args else np.arange(5, 100, 5)
    flag = args['flag'] if 'flag' in args else 'UNFLAGGED'
    num_runs = args['num_runs']
    if 'cols' in args:
        columns = args['cols']
    else:
        # Keep the frame's own column order so the grouping is deterministic.
        columns = [name for name in data.columns.tolist() if name != 'person_id']

    # A private generator when a seed is given; otherwise the global state.
    rng = np.random.RandomState(args['seed']) if args.get('seed') is not None else np.random

    # Group sizes in the full dataset, one row per quasi-identifier combination.
    population_sizes = data.groupby(columns).size().reset_index(name='group_size')

    total_rows = data.shape[0]
    summary_keys = ['sample %', 'tier', 'sample marketer']
    runs = []
    for pct in sizes:
        for n in np.repeat(pct, num_runs):
            # Floor division: the sample size must be an integer row count.
            sample_size = int((total_rows * n) // 100)
            if sample_size == 0:
                # An empty sample can never match a group; skip the run.
                continue

            # `positions` are row positions, so select with iloc (not labels).
            positions = rng.choice(total_rows, sample_size, replace=False)
            sample = data.iloc[positions]

            sample_risk = sample.deid.risk(id='person_id', quasi_id=columns)
            sample_sizes = sample.groupby(columns).size().reset_index(name='group_size')

            # Both frames carry 'group_size', so the merge yields
            # group_size_x (sample) and group_size_y (full dataset).
            merged = pd.merge(sample_sizes, population_sizes, on=columns, how='inner')
            if merged.shape[0] == 0:
                continue

            merged['marketer'] = (
                merged['group_size_x'] / merged['group_size_y'].astype(np.float64)
            ) / sample_sizes['group_size'].sum()
            merged['sample %'] = n
            merged['tier'] = flag
            merged['sample marketer'] = sample_risk['marketer'].values[0]

            # Sum only the marketer column; the quasi-identifier columns may
            # hold strings and must not take part in the aggregation.
            summary = merged.groupby(summary_keys, as_index=False)['marketer'].sum()
            runs.append(summary[['sample %', 'marketer', 'sample marketer', 'tier']])

    if not runs:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    result = pd.concat(runs)
    result.index = np.arange(result.shape[0]).astype(np.int64)
    return result
In [4]:
# Experiment phase 2: run risk() for every column combination in `comb_attr`
# (defined in the configuration cell at the top of the notebook) and write one
# Excel sheet per combination.

# Imported as a module rather than with `*` so that its names cannot shadow the
# notebook's own `risk`, `pd` or `np`.
# NOTE(review): presumably importing it registers the `deid` accessor that
# risk() calls -- verify.
import pandas_risk  # noqa: F401

PATH = "out/experiment-phase-2.xlsx"

# Make sure the output directory exists before the writer opens the file.
out_dir = os.path.dirname(PATH)
if out_dir and not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# The context manager saves and closes the workbook even if a run fails.
with pd.ExcelWriter(PATH, engine='xlsxwriter') as writer:
    for cols in comb_attr:
        o = risk(data=dfc, cols=cols, flag='CONTROLLED', num_runs=5)

        # Adding the policy: one 0/1 indicator per column of `dfc`
        # (1 when the column belongs to `cols`), repeated for every result row.
        policy = pd.DataFrame(
            np.tile(1 * dfc.columns.isin(cols), (o.shape[0], 1)),
            columns=dfc.columns,
        )
        o = o.join(policy)

        # Write this combination to its own sheet, named after its columns.
        o.to_excel(writer, sheet_name="-".join(cols))
In [20]:
# Scratch check of the policy join. It reads `cols` and `o` as left behind by
# the export loop in the previous cell, so it only runs after that cell.
membership = dfc.columns.isin(cols)
x = [1 * membership for _ in range(o.shape[0])]
policy_frame = pd.DataFrame(x, columns=dfc.columns)
o.join(policy_frame)
Out[20]:
person_id | year_of_birth | month_of_birth | day_of_birth | birth_datetime | race_concept_id | ethnicity_concept_id | location_id | care_site_id | person_source_value | ... | gender_source_concept_id | race_source_value | ethnicity_source_value | sex_at_birth | birth_date | race | zip | city | state | gender |
---|
0 rows × 21 columns
In [6]:
# NOTE(review): `columns` is only assigned inside risk(); no cell visible here
# defines it at notebook scope, so evaluating this cell raises NameError (see
# the traceback recorded below). Leftover debugging cell.
columns
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-6-8e7b9895361f> in <module>() ----> 1 columns NameError: name 'columns' is not defined
In [ ]: