privacykit/risk.ipynb at 0b16ce94ccf6361899e6a225d7afdee4191979df

8.7 KiB

Raw Blame History

None <html lang="en"> <head> </head>

In [4]:

"""
    This notebook is intended to show how to use the risk framework:
    There are two basic usages:
        1. Experiment
            
            Here the framework will select a number of random fields other than the patient id and compute risk for the selection.
            This will repeat over a designated number of runs.
            
            The parameters to pass to enable this mode are id=<patient id>,num_runs=<number of runs>
        2. Assessment
        
            Here the framework assumes you are only interested in a list of quasi identifiers and will run the evaluation once for a given list of quasi identifiers.
            The parameters to enable this mode are id=<patient id>,quasi_id=<list of quasi ids>
"""
import os
import pandas as pd
import numpy as np


#
#-- Loading a template file
# The example is taken from a de-identification white paper:
# http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf
#

import pandas as pd
import numpy as np
from io import StringIO
# Inline sample data: one row per patient; `id` is the patient identifier and
# the remaining columns are the candidate quasi-identifiers.
# The u-prefix keeps the literal a text (unicode) string on Python 2 as well,
# which is what io.StringIO requires; on Python 3 it is a no-op.
csv = u"""
id,sex,age,profession,drug_test
1,M,37,doctor,-
2,F,28,doctor,+
3,M,37,doctor,-
4,M,28,doctor,+
5,M,28,doctor,-
6,M,37,doctor,-
"""
# Build the in-memory file directly from the text. The previous
# write(unicode(csv)) / seek(0) sequence relied on the Python 2-only `unicode`
# builtin and raised NameError on Python 3.
f = StringIO(csv)
# The leading blank line of the literal is skipped by read_csv's default settings.
MY_DATAFRAME = pd.read_csv(f)

In [2]:

"""
    Here's the pandas_risk code verbatim. 
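    NOTE: the class is registered as the "deid" dataframe accessor, so it is called as <dataframe>.deid.risk(...)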
"""
@pd.api.extensions.register_dataframe_accessor("deid")
class deid :
    """
        De-identification accessor: computes re-identification risk (marketer, prosecutor)
        for the pandas dataframe it is attached to.

        The class is registered under the name "deid", so it is used as <dataframe>.deid.risk(...)
    """
    def __init__(self,df):
        # dataframe the accessor was reached through
        self._df = df

    def risk(self,**args):
        """
            Compute marketer and prosecutor risk over groups of records sharing the same
            values for a set of fields (quasi identifiers).

            For a given selection of fields the records are grouped by these fields and
            the non-null values of the id field are counted per group, then:
                marketer    = number of groups / total number of counted records
                prosecutor  = 1 / size of the smallest group

            @param  id          name of patient field (required)
            @params num_runs    number of runs (default will be 100), ignored if quasi_id is given
            @params quasi_id    list of quasi identifiers to be used (this will only perform a single run)

            @return dataframe with one row per run and the columns marketer, prosecutor, selected
                    (selected is the number of fields used for the run), indexed 0..runs-1
            @raise  KeyError    if id is not provided
            @raise  ValueError  if there is no field left to group by once the id field is excluded
        """

        id_field = args['id']
        assessment = 'quasi_id' in args
        if assessment :
            num_runs = 1
            #
            # The id field is dropped by name. set(id) must not be used here: it splits the
            # name into single characters, so the id field was never actually excluded.
            # Order is preserved and duplicates are ignored.
            columns = []
            for name in args['quasi_id'] :
                if name != id_field and name not in columns :
                    columns.append(name)
        else :
            num_runs = args.get('num_runs', 100)
            #
            # Keeping the dataframe's column order (instead of going through a set) makes
            # the random selection below reproducible when numpy's random generator is seeded
            columns = [name for name in self._df.columns if name != id_field]

        k = len(columns)
        if k == 0 :
            raise ValueError("risk requires at least one field other than the id field")

        records = []
        for _ in range(num_runs) :
            if assessment :
                cols = columns
            else :
                #
                # let's choose a random number of columns and compute marketer and prosecutor risk
                # randint excludes its upper bound, so between 2 and k-1 fields are picked;
                # with only one or two candidate fields that range is empty and all of them are used
                #
                n = np.random.randint(2,k) if k > 2 else k
                picked = np.random.choice(k,n,replace=False)
                cols = [columns[i] for i in picked]
            #
            # Once the fields are selected we run a groupby clause: x_ holds the size of each group
            #
            x_ = self._df.groupby(cols)[id_field].count().values
            records.append(
                {
                    "selected":len(cols),
                    "marketer": x_.size / np.float64(np.sum(x_)),
                    "prosecutor":1 / np.float64(np.min(x_))
                }
            )
        #
        # The frame is built once from all the records: DataFrame.append was removed in pandas 2.0
        # and growing a frame row by row is quadratic. The column order is fixed explicitly so
        # it does not depend on the pandas version.
        return pd.DataFrame(records,columns=["marketer","prosecutor","selected"])

In [7]:

#
# Let us compute risk here for random selections of quasi identifiers (experiment mode)
# We will run this experiment 5 times (num_runs=5), each run yields one row of the result
# NOTE: the selection relies on np.random and no seed is set in this notebook,
#       so the rows will differ from one execution to the next
#
MY_DATAFRAME.deid.risk(id='id',num_runs=5)

Out[7]:

marketer	prosecutor	selected
0.500000	1.0	2
0.500000	1.0	3
0.500000	1.0	3
0.333333	1.0	2
0.333333	0.5	2

In [8]:

#
# In this scenario we are just interested in sex, profession and age (assessment mode):
# passing quasi_id evaluates the risk once for exactly this list of fields
#
MY_DATAFRAME.deid.risk(id='id',quasi_id=['age','sex','profession'])

Out[8]:

	marketer	prosecutor	selected
0	0.5	1.0	3

In [ ]:

</html>

8.7 KiB Raw Blame History

8.7 KiB

Raw Blame History