bug fix: prosecutor risk, marketer risk

Steve L. Nyemba -- The Architect 2018-09-27 10:33:52 -05:00
parent 18bfa63df1
commit 140a4c4573
3 changed files with 374 additions and 74 deletions


@@ -2,15 +2,29 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 66,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "dev-deid-600@aou-res-deid-vumc-test.iam.gserviceaccount.com df0ac049-d5b6-416f-ab3c-6321eda919d6 2018-09-25 08:18:34.829000+00:00 DONE\n"
+     ]
+    }
+   ],
    "source": [
     "import pandas as pd\n",
     "import numpy as np\n",
     "from google.cloud import bigquery as bq\n",
     "\n",
-    "client = bq.Client.from_service_account_json('/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json')"
+    "client = bq.Client.from_service_account_json('/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json')\n",
+    "# pd.read_gbq(query=\"select * from raw.observation limit 10\",private_key='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json')\n",
+    "jobs = client.list_jobs()\n",
+    "for job in jobs :\n",
+    "# print dir(job)\n",
+    "    print job.user_email,job.job_id,job.started, job.state\n",
+    "    break"
    ]
   },
   {
@@ -25,7 +39,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 181,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -68,7 +82,7 @@
     "    else:\n",
     "        x_ = args['xi']\n",
     "    for xi in x_ :\n",
-    "        fields += (['.'.join([xi['name'],name]) for name in xi['fields'] if name != args['join']])\n",
+    "        fields += (['.'.join([xi['name'], name]) for name in xi['fields'] if name != args['join']])\n",
     "    return fields\n",
     "def generate_sql(**args):\n",
     "    \"\"\"\n",
@@ -97,7 +111,27 @@
     "            tmp.append(ON_SQL)\n",
     "        INNER_JOINS += [JOIN_SQL + \" AND \".join(tmp)]\n",
     "    return SQL + \" \".join(INNER_JOINS)\n",
-    "    \n",
+    "def get_final_sql(**args):\n",
+    "    xo = args['xo']\n",
+    "    xi = args['xi']\n",
+    "    join = args['join']\n",
+    "    prefix = args['prefix'] if 'prefix' in args else ''\n",
+    "    fields = get_fields(xo=xo,xi=xi,join=join)\n",
+    "    k = len(fields)\n",
+    "    n = np.random.randint(2,k) #-- number of fields to select\n",
+    "    i = np.random.randint(0,k,size=n)\n",
+    "    fields = [name for name in fields if fields.index(name) in i]\n",
+    "    base_sql = generate_sql(xo=xo,xi=xi,prefix=prefix)\n",
+    "    SQL = \"\"\"\n",
+    "        SELECT AVG(count),size,n as selected_features,k as total_features\n",
+    "        FROM(\n",
+    "            SELECT COUNT(*) as count,count(:join) as pop,sum(:n) as N,sum(:k) as k,:fields\n",
+    "            FROM (:sql)\n",
+    "            GROUP BY :fields\n",
+    "        )\n",
+    "        order by 1\n",
+    "    \"\"\".replace(\":sql\",base_sql)\n",
     "# sql = \"SELECT :fields FROM :xo.name INNER JOIN :xi.name ON :xi.name.:xi.y = :xo.y \"\n",
     "# fields = \",\".join(get_fields(xo=xi,xi=xi,join=xi['y']))\n",
     "    \n",
@@ -111,24 +145,39 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 183,
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "xo = {\"name\":\"person\",\"fields\":['person_id','date_of_birth','race','value_as_number']}\n",
+    "xi = [{\"name\":\"measurement\",\"fields\":['person_id','value_as_number','value_source_value']}] #,{\"name\":\"observation\",\"fields\":[\"person_id\",\"value_as_string\",\"observation_source_value\"]}]\n",
+    "# generate_sql(xo=xo,xi=xi,join=\"person_id\",prefix='raw')\n",
+    "fields = get_fields(xo=xo,xi=xi,join='person_id')\n",
+    "ofields = list(fields)\n",
+    "k = len(fields)\n",
+    "n = np.random.randint(2,k) #-- number of fields to select\n",
+    "i = np.random.randint(0,k,size=n)\n",
+    "fields = [name for name in fields if fields.index(name) in i]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'SELECT :fields FROM raw.person INNER JOIN raw.measurement ON measurement.person_id = person.person_id'"
+       "['person.race', 'person.value_as_number', 'measurement.value_source_value']"
       ]
      },
-     "execution_count": 183,
+     "execution_count": 34,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "xo = {\"name\":\"person\",\"fields\":['person_id','date_of_birth','race']}\n",
-    "xi = [{\"name\":\"measurement\",\"fields\":['person_id','value_as_number','value_source_value']}] #,{\"name\":\"observation\",\"fields\":[\"person_id\",\"value_as_string\",\"observation_source_value\"]}]\n",
-    "generate_sql(xo=xo,xi=xi,join=\"person_id\",prefix='raw')"
+    "fields\n"
    ]
   },
   {
@@ -179,69 +228,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 111,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "[u'condition_occurrence.condition_occurrence_id',\n",
-       " u'condition_occurrence.person_id',\n",
-       " u'condition_occurrence.condition_concept_id',\n",
-       " u'condition_occurrence.condition_start_date',\n",
-       " u'condition_occurrence.condition_start_datetime',\n",
-       " u'condition_occurrence.condition_end_date',\n",
-       " u'condition_occurrence.condition_end_datetime',\n",
-       " u'condition_occurrence.condition_type_concept_id',\n",
-       " u'condition_occurrence.stop_reason',\n",
-       " u'condition_occurrence.provider_id',\n",
-       " u'condition_occurrence.visit_occurrence_id',\n",
-       " u'condition_occurrence.condition_source_value',\n",
-       " u'condition_occurrence.condition_source_concept_id',\n",
-       " u'death.death_date',\n",
-       " u'death.death_datetime',\n",
-       " u'death.death_type_concept_id',\n",
-       " u'death.cause_concept_id',\n",
-       " u'death.cause_source_value',\n",
-       " u'death.cause_source_concept_id',\n",
-       " u'device_exposure.device_exposure_id',\n",
-       " u'device_exposure.device_concept_id',\n",
-       " u'device_exposure.device_exposure_start_date',\n",
-       " u'device_exposure.device_exposure_start_datetime',\n",
-       " u'device_exposure.device_exposure_end_date',\n",
-       " u'device_exposure.device_exposure_end_datetime',\n",
-       " u'device_exposure.device_type_concept_id',\n",
-       " u'device_exposure.unique_device_id',\n",
-       " u'device_exposure.quantity',\n",
-       " u'device_exposure.provider_id',\n",
-       " u'device_exposure.visit_occurrence_id',\n",
-       " u'device_exposure.device_source_value',\n",
-       " u'device_exposure.device_source_concept_id',\n",
-       " u'drug_exposure.drug_exposure_id',\n",
-       " u'drug_exposure.drug_concept_id',\n",
-       " u'drug_exposure.drug_exposure_start_date',\n",
-       " u'drug_exposure.drug_exposure_start_datetime',\n",
-       " u'drug_exposure.drug_exposure_end_date',\n",
-       " u'drug_exposure.drug_exposure_end_datetime',\n",
-       " u'drug_exposure.drug_type_concept_id',\n",
-       " u'drug_exposure.stop_reason',\n",
-       " u'drug_exposure.refills',\n",
-       " u'drug_exposure.quantity',\n",
-       " u'drug_exposure.days_supply',\n",
-       " u'drug_exposure.sig',\n",
-       " u'drug_exposure.route_concept_id',\n",
-       " u'drug_exposure.effective_drug_dose',\n",
-       " u'drug_exposure.dose_unit_concept_id',\n",
-       " u'drug_exposure.lot_number',\n",
-       " u'drug_exposure.provider_id',\n",
-       " u'drug_exposure.visit_occurrence_id',\n",
-       " u'drug_exposure.drug_source_value',\n",
-       " u'drug_exposure.drug_source_concept_id',\n",
-       " u'drug_exposure.route_source_value',\n",
-       " u'drug_exposure.dose_unit_source_value']"
+       "array([1, 3, 0, 0])"
       ]
      },
-     "execution_count": 111,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -250,12 +246,7 @@
     "#\n",
     "# find every table with person id at the very least or a subset of fields\n",
     "#\n",
-    "info = get_tables(client,'raw',['person_id'])\n",
-    "# get_fields(xo=names[0],xi=names[1:4],join='person_id')\n",
-    "\n",
-    "# q = ['person_id']\n",
-    "# pairs = list(itertools.combinations(names,len(names)))\n",
-    "# pairs[0]"
+    "np.random.randint(0,4,size=4)"
    ]
   },
   {
@@ -287,6 +278,72 @@
    "x_ = 1"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": 10,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "x_ = pd.DataFrame({\"group\":[1,1,1,1,1], \"size\":[2,1,1,1,1]})"
+  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": 12,
+  "metadata": {},
+  "outputs": [
+   {
+    "data": {
+     "text/html": [
+      "<div>\n",
+      "<style scoped>\n",
+      "    .dataframe tbody tr th:only-of-type {\n",
+      "        vertical-align: middle;\n",
+      "    }\n",
+      "\n",
+      "    .dataframe tbody tr th {\n",
+      "        vertical-align: top;\n",
+      "    }\n",
+      "\n",
+      "    .dataframe thead th {\n",
+      "        text-align: right;\n",
+      "    }\n",
+      "</style>\n",
+      "<table border=\"1\" class=\"dataframe\">\n",
+      "  <thead>\n",
+      "    <tr style=\"text-align: right;\">\n",
+      "      <th></th>\n",
+      "      <th>size</th>\n",
+      "    </tr>\n",
+      "    <tr>\n",
+      "      <th>group</th>\n",
+      "      <th></th>\n",
+      "    </tr>\n",
+      "  </thead>\n",
+      "  <tbody>\n",
+      "    <tr>\n",
+      "      <th>1</th>\n",
+      "      <td>1.2</td>\n",
+      "    </tr>\n",
+      "  </tbody>\n",
+      "</table>\n",
+      "</div>"
+     ],
+     "text/plain": [
+      "      size\n",
+      "group     \n",
+      "1      1.2"
+     ]
+    },
+    "execution_count": 12,
+    "metadata": {},
+    "output_type": "execute_result"
+   }
+  ],
+  "source": [
+   "x_.groupby(['group']).mean()\n"
+  ]
+ },
  {
   "cell_type": "code",
   "execution_count": null,
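A note on the field-sampling cells above: np.random.randint(0,k,size=n) draws indices with replacement, so a cell can end up keeping fewer than n distinct fields. The sketch below (assuming the same fields list as the notebook cell) shows the duplicate-free draw that src/risk.py uses instead:

import numpy as np

fields = ['person.race','person.value_as_number','measurement.value_source_value','person.date_of_birth']
k = len(fields)
n = np.random.randint(2,k)                 #-- number of fields to select (2 <= n < k)
ii = np.random.choice(k,n,replace=False)   #-- n distinct indices, unlike randint(0,k,size=n)
selected = list(np.array(fields)[ii])
print selected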
src/params.py (new file, 17 lines)

@@ -0,0 +1,17 @@
import sys
# Parse command-line arguments of the form: [action] --key value --flag
# A bare leading token becomes SYS_ARGS['action']; a --flag without a value defaults to 1 (truthy).
SYS_ARGS = {}
if len(sys.argv) > 1 :
    N = len(sys.argv)
    i = 1
    # walk the tokens with a manual index so a consumed value can be skipped
    while i < N :
        value = 1
        if sys.argv[i].startswith('--') :
            key = sys.argv[i].replace('-','')
            if i + 1 < N and not sys.argv[i+1].startswith('--') :
                value = sys.argv[i + 1].strip()
                i += 1
            SYS_ARGS[key] = value
        elif 'action' not in SYS_ARGS :
            SYS_ARGS['action'] = sys.argv[i].strip()
        i += 1
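For illustration, a hypothetical invocation (flag values borrowed from the commented examples in src/risk.py) would populate SYS_ARGS as follows:

# python risk.py create --i_dataset raw --key person_id --o_dataset risk_o --table mo --path accounts/vumc-test.json
# SYS_ARGS == {'action':'create', 'i_dataset':'raw', 'key':'person_id',
#              'o_dataset':'risk_o', 'table':'mo', 'path':'accounts/vumc-test.json'}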

src/risk.py (new file, 226 lines)

@@ -0,0 +1,226 @@
"""
Steve L. Nyemba & Brad Malin
Health Information Privacy Lab.
This code is proof of concept as to how risk is computed against a database (at least a schema).
The engine will read tables that have a given criteria (patient id) and generate a dataset by performing joins.
Because joins are process intensive we decided to add a limit to the records pulled.
TL;DR:
This engine generates a dataset and computes risk (marketer and prosecutor)
Assumptions:
- We assume tables that reference patients will name the keys identically (best practice). This allows us to be able to leverage data store's that don't support referential integrity
Usage :
Limitations
- It works against bigquery for now
@TODO:
- Need to write a transport layer (database interface)
- Support for referential integrity, so one table can be selected and a dataset derived given referential integrity
- Add support for journalist risk
"""
import pandas as pd
import numpy as np
from google.cloud import bigquery as bq
import time
from params import SYS_ARGS
class utils :
    """
    Utility class that generates SQL-11 compatible code to run the risk assessment.
    @TODO: plugins for other data stores
    """
    def __init__(self,**args):
        # self.path = args['path']
        self.client = args['client']
    def get_tables(self,**args):
        """
        Return the list of tables that can be joined on a given key. The key is the name of the field
        that uniquely designates a patient/person in the database.
        @param key      name of the patient field
        @param dataset  dataset name
        @param client   initialized bigquery client
        @return [{name,fields:[],row_count,full_name}]
        """
        dataset = args['dataset']
        client = args['client']
        key = args['key']
        r = []
        ref = client.dataset(dataset)
        tables = list(client.list_tables(ref))
        for table in tables :
            if table.table_id.strip() in ['people_seed']:
                print ' skipping ...'
                continue
            ref = table.reference
            table = client.get_table(ref)
            schema = table.schema
            rows = table.num_rows
            if rows == 0 :
                continue
            names = [f.name for f in schema]
            x = list(set(names) & set([key]))
            if x :
                full_name = ".".join([dataset,table.table_id])
                r.append({"name":table.table_id,"fields":names,"row_count":rows,"full_name":full_name})
        return r
    def get_field_name(self,alias,field_name,index):
        """
        Format a field name given an index (the number of times it has already occurred in the projection).
        The index is intended to avoid a "duplicate field" error (bigquery issue).
        @param alias       alias of the table
        @param field_name  name of the field to be formatted
        @param index       the number of times the field appears in the projection
        """
        name = [alias,field_name]
        if index > 0 :
            return ".".join(name)+" AS :field_name:index".replace(":field_name",field_name).replace(":index",str(index))
        else:
            return ".".join(name)
    def get_sql(self,**args):
        """
        Generate the SQL that joins a list of tables given a key and a record limit.
        @param tables  list of tables
        @param key     field used in the join; the assumption is that the field name is identical across tables (best practice!)
        @param limit   limit imposed on each table, since joins are resource-intensive
        """
        tables = args['tables']
        key = args['key']
        limit = args['limit'] if 'limit' in args else 300000
        limit = str(limit)
        SQL = [
            """
            SELECT :fields
            FROM
            """]
        fields = []
        prev_alias = None
        for table in tables :
            name = table['full_name'] #".".join([self.i_dataset,table['name']])
            alias = table['name']
            index = tables.index(table)
            sql_ = """
                (select * from :name limit :limit) as :alias
            """.replace(":limit",limit)
            sql_ = sql_.replace(":name",name).replace(":alias",alias)
            #-- keep the join key only once (from the first table) to avoid duplicate columns
            fields += [self.get_field_name(alias,field_name,index) for field_name in table['fields'] if field_name != key or (field_name == key and index == 0)]
            if index > 0 :
                #-- every subsequent table is joined back to the first table on the key
                join = """
                    INNER JOIN :sql ON :alias.:field = :prev_alias.:field
                """
                join = join.replace(":alias",alias).replace(":field",key).replace(":prev_alias",prev_alias)
                sql_ = join.replace(":sql",sql_)
            SQL += [sql_]
            if index == 0 :
                prev_alias = str(alias)
        return " ".join(SQL).replace(":fields"," , ".join(fields))
class risk :
    """
    Builds the SQL query that computes marketer and prosecutor risk (for now).
    """
    def __init__(self):
        pass
    def get_sql(self,**args) :
        """
        Return the SQL query that computes marketer and prosecutor risk.
        @param key    key field (patient identifier)
        @param table  table that is the subject of the computation
        """
        key = args['key']
        table = args['table']
        fields = list(set(table['fields']) - set([key]))
        #-- select a random subset of the quasi-identifier fields
        k = len(fields)
        n = np.random.randint(2,min(k,24)) #-- how many random fields we process (capped so the draw can't exceed the field count)
        ii = np.random.choice(k,n,replace=False)
        fields = list(np.array(fields)[ii])
        sql = """
            SELECT COUNT(g_size) as group_count, SUM(g_size) as patient_count, COUNT(g_size)/SUM(g_size) as marketer, 1/ MIN(g_size) as prosecutor
            FROM (
                SELECT COUNT(*) as g_size,:key,:fields
                FROM :full_name
                GROUP BY :key,:fields
            )
        """.replace(":fields", ",".join(fields)).replace(":full_name",table['full_name']).replace(":key",key)
        return sql
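# For illustration only (hypothetical table and field sample): with key='person_id',
# full_name='raw.person' and sampled fields ['race','date_of_birth'], get_sql
# returns a query of the shape:
#   SELECT COUNT(g_size) as group_count, SUM(g_size) as patient_count,
#          COUNT(g_size)/SUM(g_size) as marketer, 1/ MIN(g_size) as prosecutor
#   FROM (
#       SELECT COUNT(*) as g_size,person_id,race,date_of_birth
#       FROM raw.person
#       GROUP BY person_id,race,date_of_birth
#   )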
if 'action' in SYS_ARGS and SYS_ARGS['action'] in ['create','compute'] :
    path = SYS_ARGS['path']
    client = bq.Client.from_service_account_json(path)
    i_dataset = SYS_ARGS['i_dataset']
    key = SYS_ARGS['key']
    mytools = utils(client = client)
    tables = mytools.get_tables(dataset=i_dataset,client=client,key=key)
    # print len(tables)
    # tables = tables[:6]
    if SYS_ARGS['action'] == 'create' :
        # usage:
        #   create --i_dataset <in dataset> --key <patient id> --o_dataset <out dataset> --table <table|file> [--file] --path <bq JSON account file>
        #
        create_sql = mytools.get_sql(tables=tables,key=key) #-- the create statement
        o_dataset = SYS_ARGS['o_dataset']
        table = SYS_ARGS['table']
        if 'file' in SYS_ARGS :
            f = open(table+'.sql','w')
            f.write(create_sql)
            f.close()
        else:
            job = bq.QueryJobConfig()
            job.destination = client.dataset(o_dataset).table(table)
            job.use_query_cache = True
            job.allow_large_results = True
            job.priority = 'BATCH'
            job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
            r = client.query(create_sql,location='US',job_config=job)
            print [r.job_id,' ** ',r.state]
    else:
        #
        # compute risk against a single table, repeating the random field selection
        #
        tables = [tab for tab in tables if tab['name'] == SYS_ARGS['table']]
        if tables :
            engine = risk()
            df = pd.DataFrame()
            for i in range(0,10) :
                sql = engine.get_sql(key=SYS_ARGS['key'],table=tables[0])
                df = df.append(pd.read_gbq(query=sql,private_key=path,dialect='standard'))
                df.to_csv(SYS_ARGS['table']+'.csv') #-- checkpoint after every draw
                print [i,' ** ',df.shape[0]]
                time.sleep(2)
        else:
            print 'ERROR: table not found'
# r = risk(path='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json', i_dataset='raw',o_dataset='risk_o',o_table='mo')
# tables = r.get_tables('raw','person_id')
# sql = r.get_sql(tables=tables[:3],key='person_id')
# #
# # let's post this to a designated location
# #
# f = open('foo.sql','w')
# f.write(sql)
# f.close()
# r.get_sql(tables=tables,key='person_id')
# p = r.compute()
# print p
# p.to_csv("risk.csv")
# r.write('foo.sql')
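As a sanity check, the same two measures can be computed locally with pandas. This is a minimal sketch on the toy frame from the notebook (group stands in for a combination of quasi-identifier values):

import pandas as pd

x_ = pd.DataFrame({"group":[1,1,1,1,1], "size":[2,1,1,1,1]})
g_size = x_.groupby(['group']).size()            #-- equivalence-class sizes g_1..g_m
marketer = float(g_size.count()) / g_size.sum()  #-- group_count / patient_count = 1/5
prosecutor = 1.0 / g_size.min()                  #-- 1 / MIN(g_size) = 1/5
print marketer, prosecutor                       #-- 0.2 0.2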