8.7 KiB
8.7 KiB
None
<html lang="en">
<head>
</head>
</html>
In [4]:
"""
This notebook shows how to use the risk framework.
There are two basic usages:

1. Experiment

    The framework selects a number of random fields (other than the patient id)
    and computes risk for that selection.
    This is repeated over a designated number of runs.

    The parameters to pass to enable this mode are
    id=<patient id>,num_runs=<number of runs>

2. Assessment

    The framework assumes you are only interested in a given list of quasi
    identifiers and runs the evaluation once for that list.

    The parameters to enable this mode are
    id=<patient id>,quasi_id=<list of quasi ids>
"""
import os
from io import StringIO

import numpy as np
import pandas as pd

#
#-- Loading a template file
# The example is taken from a de-identification white paper:
# http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf
#
# The literal carries the u prefix so io.StringIO accepts it on Python 2 as
# well as Python 3; the former call to the Python 2-only builtin unicode()
# raised NameError on Python 3.
csv = u"""
id,sex,age,profession,drug_test
1,M,37,doctor,-
2,F,28,doctor,+
3,M,37,doctor,-
4,M,28,doctor,+
5,M,28,doctor,-
6,M,37,doctor,-
"""

# The buffer is created already positioned at offset 0, so no write()/seek()
# round trip is needed. The leading blank line of the literal is skipped by
# read_csv (skip_blank_lines defaults to True).
f = StringIO(csv)
MY_DATAFRAME = pd.read_csv(f)
In [2]:
"""
    The pandas_risk accessor, included here so the notebook is self-contained.
"""
@pd.api.extensions.register_dataframe_accessor("deid")
class deid:
    """
        De-identification accessor, available on any dataframe as ``df.deid``.
        It computes re-identification risk (marketer, prosecutor) for a pandas dataframe.
    """

    def __init__(self, df):
        self._df = df

    def risk(self, **args):
        """
            Compute marketer and prosecutor risk over groups of quasi identifiers.

            @param id           name of the patient field
            @param num_runs     number of runs (default 100); ignored when quasi_id is given
            @param quasi_id     list of quasi identifiers to be used (this performs a single run)

            @return a dataframe with one row per run (index 0..runs-1) and the columns
                    selected     number of fields used for the run
                    marketer     number of groups / number of records counted
                    prosecutor   1 / size of the smallest group

            @raise ValueError if no field other than the id is available
        """
        id = args['id']
        fixed_selection = 'quasi_id' in args
        if fixed_selection:
            num_runs = 1
            candidates = args['quasi_id']
        else:
            num_runs = args['num_runs'] if 'num_runs' in args else 100
            candidates = self._df.columns
        #
        # Drop the id field and duplicates while keeping the caller's order.
        # (set(id) would split the field name into characters and remove nothing,
        # and an unordered set makes seeded runs irreproducible.)
        #
        columns = []
        for name in candidates:
            if name != id and name not in columns:
                columns.append(name)
        if not columns:
            raise ValueError("risk requires at least one field other than the id field")

        k = len(columns)
        rows = []
        for _ in range(num_runs):
            #
            # Pick a random number of fields unless the caller fixed the list.
            # randint(2,k) needs k > 2 (its upper bound is exclusive); with one or
            # two candidate fields there is nothing to draw from, so all are used.
            #
            if fixed_selection or k <= 2:
                cols = columns
            else:
                n = np.random.randint(2, k)  #-- number of random fields we are picking
                ii = np.random.choice(k, n, replace=False)
                cols = [columns[i] for i in ii]
            #
            # Size of each group of records sharing the same values for the selected fields
            #
            x_ = self._df.groupby(cols)[id].count().values
            rows.append(
                {
                    "selected": len(cols),
                    "marketer": x_.size / np.float64(np.sum(x_)),
                    "prosecutor": 1 / np.float64(np.min(x_))
                }
            )
        #
        # Build the frame once: DataFrame.append was removed in pandas 2.0 and
        # growing a frame row by row is quadratic.
        #
        return pd.DataFrame(rows)
In [7]:
#
# Compute risk for random selections of quasi identifiers.
# The experiment is run 5 times, giving one row per run.
#
MY_DATAFRAME.deid.risk(
    id='id',
    num_runs=5,
)
Out[7]:
 | marketer | prosecutor | selected |
---|---|---|---|
0 | 0.500000 | 1.0 | 2 |
0 | 0.500000 | 1.0 | 3 |
0 | 0.500000 | 1.0 | 3 |
0 | 0.333333 | 1.0 | 2 |
0 | 0.333333 | 0.5 | 2 |
In [8]:
#
# Assessment: evaluate risk once for a fixed list of quasi identifiers
# (age, sex and profession).
#
MY_DATAFRAME.deid.risk(
    id='id',
    quasi_id=['age', 'sex', 'profession'],
)
Out[8]:
 | marketer | prosecutor | selected |
---|---|---|---|
0 | 0.5 | 1.0 | 3 |
In [ ]: