modular features, documentation new version

This commit is contained in:
Steve Nyemba 2022-10-12 08:23:04 -05:00
parent 3c643eb4df
commit 7f8754b5f1
3 changed files with 166 additions and 23 deletions

View File

@ -3,6 +3,8 @@
This framework computes re-identification risk of a dataset by extending pandas. It works like a pandas **add-on** This framework computes re-identification risk of a dataset by extending pandas. It works like a pandas **add-on**
The framework will compute the following risk measures: marketer, prosecutor, journalist and pitman risk. References for the risk measures can be found on [http://ehealthinformation.ca](http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf) and [https://www.scb.se/contentassets](https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf) The framework will compute the following risk measures: marketer, prosecutor, journalist and pitman risk. References for the risk measures can be found on [http://ehealthinformation.ca](http://www.ehealthinformation.ca/wp-content/uploads/2014/08/2009-De-identification-PA-whitepaper1.pdf) and [https://www.scb.se/contentassets](https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf)
There are two modes available : There are two modes available :
**explore:** **explore:**
@ -16,10 +18,10 @@ Here the assumption is that we are clear on the sets of attributes to be used an
### Four risk measures are computed : ### Four risk measures are computed :
- Marketer risk - Marketer risk
- Prosecutor risk - Prosecutor risk
- Journalist risk - Journalist risk
- Pitman Risk - Pitman Risk [Video tutorial, by Dr. Weiyi Xia](https://www.loom.com/share/173e109ecac64d37a54f09b103bc6681) and [Publication by Dr. Nobuaki Hoshino](https://www.scb.se/contentassets/ff271eeeca694f47ae99b942de61df83/applying-pitmans-sampling-formula-to-microdata-disclosure-risk-assessment.pdf)
### Usage: ### Usage:

View File

@ -43,6 +43,10 @@ from datetime import datetime
import sys import sys
from itertools import combinations from itertools import combinations
# class Compute:
# pass
# class Population(Compute):
# pass
@pd.api.extensions.register_dataframe_accessor("risk") @pd.api.extensions.register_dataframe_accessor("risk")
class deid : class deid :
@ -57,6 +61,16 @@ class deid :
# #
values = df.apply(lambda col: col.unique().size / df.shape[0]) values = df.apply(lambda col: col.unique().size / df.shape[0])
self._dinfo = dict(zip(df.columns.tolist(),values)) self._dinfo = dict(zip(df.columns.tolist(),values))
# self.sample = self._df
self.init(sample=self._df)
def init(self,**_args):
    """
    (Re)build the risk handlers used by marketer/journalist/prosecutor/pitman
    :sample     optional sample dataset, defaults to the dataframe this accessor wraps
    :columns    optional list of columns to group by; when empty the Compute handler picks the columns itself
    """
    _sample = _args['sample'] if 'sample' in _args else self._df
    _columns = [] if 'columns' not in _args else _args['columns']
    if _columns :
        self._compute = Compute(sample=_sample,columns=_columns)
    else:
        #
        # The handler must be stored under the same attribute name in both branches,
        # every other method reads self._compute
        self._compute = Compute(sample=_sample)
    self._pcompute = Population()
def explore(self,**args): def explore(self,**args):
""" """
@ -115,7 +129,9 @@ class deid :
p = pd.DataFrame(1*sample.columns.isin(cols)).T p = pd.DataFrame(1*sample.columns.isin(cols)).T
p.columns = sample.columns p.columns = sample.columns
o = pd.concat([o,r.join(p)]) o = pd.concat([o,r.join(p)])
o['attr'] = ','.join(cols)
o['attributes'] = ','.join(cols)
# o['attr'] = ','.join(r.apply())
_index += 1 _index += 1
# #
# We rename flags to policies and adequately number them, we also have a column to summarize the attributes attr # We rename flags to policies and adequately number them, we also have a column to summarize the attributes attr
@ -127,7 +143,23 @@ class deid :
o.index = np.arange(o.shape[0]).astype(np.int64) o.index = np.arange(o.shape[0]).astype(np.int64)
o = o.rename(columns={'flag':'policies'}) o = o.rename(columns={'flag':'policies'})
return o return o
def evaluate(self,**_args):
    """
    Compute marketer, journalist and prosecutor risk in one call and return them as a single-row data-frame
    The row also carries the number of fields, groups and rows the handler counted.
    :attr   optional dictionary, its entries are placed in front of the computed measures
    Any other keyword argument is forwarded to init and to each risk function
    """
    self.init(**_args)
    _row = {_name: getattr(self,_name)(**_args) for _name in ('marketer','journalist','prosecutor')}
    #
    # The counts are read after the risk functions ran because they may have rebuilt the handler
    _counts = self._compute.cache['count']
    for _key in ('fields','groups','rows') :
        _row[_key] = _counts[_key]
    if 'attr' in _args :
        _row = dict(_args['attr'],**_row)
    return pd.DataFrame([_row])
def _evaluate(self, **args):
""" """
This function has the ability to evaluate risk associated with either a population or a sample dataset This function has the ability to evaluate risk associated with either a population or a sample dataset
:sample sample dataset :sample sample dataset
@ -157,7 +189,7 @@ class deid :
r = {"flag":flag} r = {"flag":flag}
# if sample : # if sample :
handle_sample = Sample() handle_sample = Compute()
xi = sample.groupby(cols,as_index=False).count().values xi = sample.groupby(cols,as_index=False).count().values
handle_sample.set('groups',xi) handle_sample.set('groups',xi)
@ -214,6 +246,82 @@ class deid :
r['field count'] = len(cols) r['field count'] = len(cols)
return pd.DataFrame([r]) return pd.DataFrame([r])
def marketer(self,**_args):
    """
    This function delegates the calls to compute marketer risk of a given dataset or sample
    :sample     optional sample dataset
    :columns    optional columns of the dataset, if none is provided an inference will be made using non-unique columns
    :pop        optional population dataset, when present the population handler is used
    """
    if 'pop' in _args :
        #
        # Computing population estimates for the population
        # The population handler reads the dataset from the 'population' key, we provide it if the caller only gave 'pop'
        _args.setdefault('population',_args['pop'])
        self._pcompute.init(**_args)
        _handler = self._pcompute
    else:
        if 'sample' in _args or 'columns' in _args :
            # A different sample or set of columns requires rebuilding the handler
            self.init(**_args)
        _handler = self._compute
    return _handler.marketer()
def journalist(self,**_args):
    """
    This function delegates the calls to compute journalist risk of a given dataset or sample
    :sample     optional sample dataset
    :columns    optional columns of the dataset, if none is provided an inference will be made using non-unique columns
    :pop        optional population dataset, when present the population handler is used
    """
    if 'pop' in _args :
        #
        # The population handler reads the dataset from the 'population' key, we provide it if the caller only gave 'pop'
        _args.setdefault('population',_args['pop'])
        self._pcompute.init(**_args)
        _handler = self._pcompute
    else:
        if 'sample' in _args or 'columns' in _args :
            # A different sample or set of columns requires rebuilding the handler
            self.init(**_args)
        _handler = self._compute
    return _handler.journalist()
def prosecutor(self,**_args):
    """
    This function delegates the calls to compute prosecutor risk of a given dataset or sample
    :sample     optional sample dataset
    :columns    optional columns of the dataset, if none is provided an inference will be made using non-unique columns
    :pop        optional population dataset, when present the population handler is used
    """
    if 'pop' in _args :
        #
        # The population handler reads the dataset from the 'population' key, we provide it if the caller only gave 'pop'
        _args.setdefault('population',_args['pop'])
        self._pcompute.init(**_args)
        _handler = self._pcompute
    else:
        if 'sample' in _args or 'columns' in _args :
            # A different sample or set of columns requires rebuilding the handler
            self.init(**_args)
        _handler = self._compute
    return _handler.prosecutor()
def pitman(self,**_args):
    """
    This function delegates the calls to compute pitman risk
    :population     optional population dataset, when present the population handler is initialized with the arguments and used
    :pop_size       population size, required when no population dataset is given (converted to an integer)
    """
    if 'population' in _args :
        self._pcompute.init(**_args)
        return self._pcompute.pitman()
    #
    # Without a population dataset the sample handler needs to be told the population size
    self._compute.set('pop_size',int(_args['pop_size']))
    return self._compute.pitman()
# xi = pd.DataFrame({"sample_group_size":sample.groupby(cols,as_index=False).count()}).reset_index()
# yi = pd.DataFrame({"population_group_size":args['pop'].groupby(cols,as_index=False).size()}).reset_index()
# merged_groups = pd.merge(xi,yi,on=cols,how='inner')
# handle_population= Population()
# handle_population.set('merged_groups',merged_groups)
class Risk : class Risk :
""" """
This class is an abstraction of how we chose to structure risk computation i.e in 2 sub classes: This class is an abstraction of how we chose to structure risk computation i.e in 2 sub classes:
@ -227,13 +335,31 @@ class Risk :
self.cache[id] = {} self.cache[id] = {}
self.cache[key] = value self.cache[key] = value
class Sample(Risk): class Compute(Risk):
""" """
This class will compute risk for the sample dataset: the marketer and prosecutor risk are computed by default. This class will compute risk for the sample dataset: the marketer and prosecutor risk are computed by default.
This class can optionally add pitman risk if the population size is known. This class can optionally add pitman risk if the population size is known.
""" """
def __init__(self,**_args):
    """
    Prepare the group sizes used by the risk functions
    :sample     optional sample dataset, defaults to an empty data-frame
    :columns    optional columns to group by, if none is provided the columns that are not unique per row are used

    After initialization self.cache['count'] holds the number of fields, groups and rows, and
    self.cache['groups'] holds one row per group whose last entry is the size of the group
    """
    super().__init__()
    self._sample = _args['sample'] if 'sample' in _args else pd.DataFrame()
    self._columns= _args['columns'] if 'columns' in _args else None
    self.cache['count'] = {'groups':0,'fields':0,'rows':0}
    if not self._columns :
        # ratio of distinct values per column, a ratio of 1 means every row has its own value
        values = self._sample.apply(lambda col: col.unique().size / self._sample.shape[0])
        self._dinfo = dict(zip(self._sample.columns.tolist(),values))
        self._columns = [key for key in self._dinfo if self._dinfo[key] < 1]
    #
    # At this point we have all the columns that are valid candidates even if the user didn't specify them
    self.cache['count']['fields'] = len(self._columns)
    if self._sample.shape[0] > 0 and self._columns:
        #
        # size() always yields the group size as the last column, count() yields no count column at all
        # when every column of the sample is used as a grouping key
        _groups = self._sample.groupby(self._columns).size().reset_index().values
        self.set('groups',_groups)
        self.cache['count']['groups'] = len(_groups)
        self.cache['count']['rows'] = np.sum([_g[-1] for _g in _groups])
def marketer(self): def marketer(self):
""" """
computing marketer risk for sample dataset computing marketer risk for sample dataset
@ -243,8 +369,10 @@ class Sample(Risk):
groups = self.cache['groups'] groups = self.cache['groups']
# group_count = groups.size # group_count = groups.size
# row_count = groups.sum() # row_count = groups.sum()
group_count = len(groups) # group_count = len(groups)
row_count = np.sum([_g[-1] for _g in groups]) group_count = self.cache['count']['groups']
# row_count = np.sum([_g[-1] for _g in groups])
row_count = self.cache['count']['rows']
return group_count / np.float64(row_count) return group_count / np.float64(row_count)
def prosecutor(self): def prosecutor(self):
@ -259,40 +387,52 @@ class Sample(Risk):
def unique_ratio(self):
    """
    Ratio of rows that are alone in their group over the cached number of rows
    The last entry of every cached group is read as the size of the group
    """
    _total = np.float64(self.cache['count']['rows'])
    _singletons = [_row[-1] for _row in self.cache['groups'] if _row[-1] == 1]
    return np.sum(_singletons) / _total
def journalist(self):
# Journalist risk is reported as the value returned by unique_ratio()
return self.unique_ratio()
def pitman(self):
    """
    This function will approximate pitman de-identification risk based on pitman sampling
    It requires 'pop_size' to have been set in the cache, and returns f ** (1 - alpha) where
    f is the cached row count over the population size and alpha the share of groups of size 1
    """
    groups = self.cache['groups']
    #
    # Only the group sizes matter here: when groups hold one row per group the size is the last entry,
    # comparing every cell to 1 would also count key values that happen to be 1
    sizes = np.asarray(groups)
    if sizes.ndim > 1 :
        sizes = sizes[:,-1]
    si = np.sum(sizes == 1)
    u = len(groups)
    alpha = np.divide(si , np.float64(u) )
    row_count = self.cache['count']['rows']
    f = np.divide(row_count, np.float64(self.cache['pop_size']))
    return np.power(f,1-alpha)
class Population(Sample): class Population(Compute):
""" """
This class will compute risk for datasets that have population information or datasets associated with them. This class will compute risk for datasets that have population information or datasets associated with them.
This computation includes pitman risk (it requires minimal information about population) This computation includes pitman risk (it requires minimal information about population)
""" """
def __init__(self,**args): def __init__(self,**_args):
Sample.__init__(self) super().__init__(**_args)
def init(self,**_args):
    """
    Match the groups of the sample with the groups of a population and cache the result as 'merged_groups'
    :population     population dataset, it must have the columns the sample is grouped by

    The merged data-frame has the grouping columns, sample_group_size and population_group_size;
    only groups found in both datasets are kept
    """
    _columns = self._columns
    #
    # size() on the grouped frame is a series indexed by the keys, reset_index turns the keys back into
    # regular columns so the two tables can be merged on them
    xi = self._sample.groupby(_columns).size().reset_index(name='sample_group_size')
    yi = _args['population'].groupby(_columns).size().reset_index(name='population_group_size')
    merged_groups = pd.merge(xi,yi,on=_columns,how='inner')
    self.set('merged_groups',merged_groups)
def set(self,key,value):
    """
    Store a value in the cache; storing 'merged_groups' also derives 'pop_size' and 'groups' from it
    :key    name of the cache entry
    :value  value to store, for 'merged_groups' a data-frame with sample_group_size and population_group_size
    """
    #
    # We call the parent implementation explicitly, calling self.set here would recurse into this very function
    Compute.set(self,key,value)
    if key == 'merged_groups' :
        Compute.set(self,'pop_size',np.float64(value.population_group_size.sum()) )
        Compute.set(self,'groups',value.sample_group_size)
""" """
This class will measure risk and account for the existance of a population This class will measure risk and account for the existance of a population
:merged_groups {sample_group_size, population_group_size} is a merged dataset with group sizes of both population and sample :merged_groups {sample_group_size, population_group_size} is a merged dataset with group sizes of both population and sample
@ -301,6 +441,7 @@ class Population(Sample):
""" """
This function requires This function requires
""" """
r = self.cache['merged_groups'] r = self.cache['merged_groups']
sample_row_count = r.sample_group_size.sum() sample_row_count = r.sample_group_size.sum()
# #

View File

@ -5,7 +5,7 @@ from setuptools import setup, find_packages
setup( setup(
name = "privacykit", name = "privacykit",
version = "0.8.1", version = "0.9.0",
author = "Healthcare/IO - The Phi Technology LLC & Health Information Privacy Lab", author = "Healthcare/IO - The Phi Technology LLC & Health Information Privacy Lab",
author_email = "info@the-phi.com", author_email = "info@the-phi.com",
license = "MIT", license = "MIT",