12 KiB
12 KiB
None
<html lang="en">
<head>
</head>
</html>
In [8]:
"""
The experiments here describe medical/family history as they associate with risk measures.
Additionally we will have fractional risk assessments.
"""
import os

import pandas as pd
import numpy as np
# NOTE(review): later cells call a `.deid` accessor on DataFrames; it is
# presumably registered as a side effect of this star import -- confirm, and
# prefer an explicit import once the required names are known.
from pandas_risk import *

# Service-account key used for every BigQuery read in this cell. It is defined
# once instead of being repeated per query, and can be overridden through the
# DEID_RISK_PRIVATE_KEY environment variable; when that variable is unset the
# original hard-coded location is used, so existing runs behave as before.
PRIVATE_KEY = os.environ.get(
    'DEID_RISK_PRIVATE_KEY',
    '/home/steve/dev/google-cloud-sdk/accounts/curation-test.json')


def _read_table(sql):
    """Run `sql` through `pd.read_gbq`, authenticating with PRIVATE_KEY.

    NOTE(review): newer pandas releases replaced the `private_key` argument
    with `credentials` -- confirm against the installed pandas version.
    """
    return pd.read_gbq(sql, private_key=PRIVATE_KEY)


# dfm: medical-history table, dff: family-history table, df: the listed
# registration fields. All three carry `person_id`, which later cells join on.
dfm = _read_table("SELECT * FROM deid_risk.registered_medical_history_dec_001")
dff = _read_table("SELECT * FROM deid_risk.registered_family_history_dec_001")
df = _read_table("SELECT person_id, birth_date,city,state,home_owner,race,ethnicity,gender,birth_place,marital_status,orientation,education,employment_status,income,travel_abroad_6_months,active_duty_status FROM deid_risk.registered_dec_01")
In [32]:
# Draw three medical-history and three family-history attributes at random;
# person_id is the join key and is never a candidate.
# replace=False is required: np.random.choice samples WITH replacement by
# default, so the same column could be drawn twice and then be duplicated in
# the frames built below. (It raises ValueError if fewer than three candidate
# columns exist.)
med_cols = np.random.choice(
    list(set(dfm.columns.tolist()) - set(['person_id'])), 3, replace=False).tolist()
fam_cols = np.random.choice(
    list(set(dff.columns.tolist()) - set(['person_id'])), 3, replace=False).tolist()

# Registration fields joined with the selected medical / family attributes.
medical = pd.merge(df, dfm[med_cols + ['person_id']], on='person_id')
family = pd.merge(df, dff[fam_cols + ['person_id']], on='person_id')

# Join the two histories explicitly on person_id. Without `on=`, pandas joins
# on every column name the two frames share, so an attribute name present in
# both tables would silently become an extra join key.
_tmp = pd.merge(dfm[med_cols + ['person_id']], dff[fam_cols + ['person_id']], on='person_id')
data = pd.merge(df, _tmp, on='person_id')
In [33]:
def _risk_columns(frame):
    """Return the column names of `frame` with `person_id` left out."""
    return list(set(frame.columns.tolist()) - set(['person_id']))


# One entry per row of the comparison table: (flag, frame, columns).
# `None` means "every column of the frame except person_id".
_risk_cases = [
    ('full history', data, None),
    ('medical', medical, None),
    ('family', family, None),
    ('no-history', df, None),
    ('medical-only', dfm, med_cols),
    ('family-only', dff, fam_cols),
]

# Evaluate each case in the order listed and stack the results into one table,
# which is displayed as the cell output.
pd.concat(
    [
        _frame.deid.evaluate(
            flag=_flag,
            cols=_risk_columns(_frame) if _cols is None else _cols,
        )
        for _flag, _frame, _cols in _risk_cases
    ],
    ignore_index=True,
)
Out[33]:
field_count | flag | group_count | marketer | prosecutor | unique_row_ratio | |
---|---|---|---|---|---|---|
0 | 21 | full history | 115308 | 0.992691 | 1.0 | 0.987663 |
1 | 18 | medical | 115306 | 0.992674 | 1.0 | 0.987629 |
2 | 18 | family | 115304 | 0.992656 | 1.0 | 0.987594 |
3 | 15 | no-history | 115300 | 0.992622 | 1.0 | 0.987526 |
4 | 3 | medical-only | 27 | 0.000232 | 0.5 | 0.000000 |
5 | 3 | family-only | 146 | 0.001257 | 1.0 | 0.000551 |
In [2]:
from __future__ import division


def evaluate(df):
    """Evaluate risk on random row samples of increasing size.

    Sample sizes are derived from percentages of the number of rows in `df`:
    four random values below 1 % followed by 5 %, 10 %, ..., 100 %. Sizes of
    one row or less are discarded and duplicates are collapsed. For each
    remaining size, that many rows are drawn without replacement and passed to
    `DataFrame.deid.evaluate` with every column except `person_id`, a flag
    holding the sampled percentage (e.g. "5.0 %") and `min_group_size=2`.

    :param df: frame to sample from; needs the `.deid` accessor used below.
    :return: the per-sample `deid.evaluate` results concatenated in increasing
             sample-size order, or an empty DataFrame when no sample size
             exceeds one row.
    """
    cols = list(set(df.columns.tolist()) - set(['person_id']))
    percents = np.round(np.random.random_sample(4), 3).tolist() + np.arange(5, 105, 5).tolist()

    # Use the full row count: with `shape[0] - 1` the last row could never be
    # drawn and the "100 %" sample was one row short.
    N = df.shape[0]
    sizes = np.divide(np.multiply(percents, N), 100).astype(np.int64)
    sizes = np.unique(sizes[sizes > 1])

    # Collect the partial results and concatenate once, instead of growing a
    # DataFrame with `append` inside the loop (quadratic, and no longer
    # available in current pandas).
    results = []
    for num_rows in sizes:
        indices = np.random.choice(N, num_rows, replace=False)
        # The 100.0 keeps this a true division even without the __future__ import.
        flag = " ".join([str(np.round(100.0 * indices.size / N, 2)), '%'])
        # `indices` are row positions, so select with iloc; loc would treat
        # them as labels and only works on a default integer index.
        sample = df.iloc[indices]
        results.append(sample.deid.evaluate(cols=cols, flag=flag, min_group_size=2))

    if not results:
        return pd.DataFrame()
    return pd.concat(results)
In [3]:
# Every column of df except the person_id key.
_non_key_columns = set(df.columns.tolist()).difference(['person_id'])
cols = list(_non_key_columns)

# Fixed subset of registration fields whose risk is evaluated below.
# NOTE(review): 'gender_identity' and 'sex_at_birth' are not among the columns
# selected by the query that loads `df` in this notebook (it selects 'gender'),
# so this cell appears to rely on a `df` loaded differently -- confirm.
_demographic_fields = [
    'race',
    'state',
    'gender_identity',
    'ethnicity',
    'marital_status',
    'education',
    'orientation',
    'sex_at_birth',
    'birth_date',
    'travel_abroad_6_months',
    'active_duty_status',
]
df[_demographic_fields].deid.evaluate()
Out[3]:
field_count | flag | group_count | marketer | prosecutor | unique_row_ratio | |
---|---|---|---|---|---|---|
0 | 11 | UNFLAGGED | 114886 | 0.989058 | 1.0 | 0.980535 |
In [68]:
#
# This is the merge with medical history
# Three distinct medical-history attributes are drawn at random. person_id is
# excluded by name rather than by position (`columns[1:]`), so the selection
# stays correct even when person_id is not the first column of dfm.
_medical_candidates = dfm.columns[dfm.columns != 'person_id']
cols = ['person_id'] + np.random.choice(_medical_candidates, 3, replace=False).tolist()
p = pd.merge(df, dfm[cols], on='person_id')
# Displayed as the cell output: the join key followed by the drawn attributes.
cols
#
# cols = list(set(p.columns.tolist()) - set(['person_id']))
# evaluate(p)
#p.deid.explore(cols=cols,num_runs=100)
Out[68]:
['person_id', 'HearingVision_FarSightedness', 'HearingVision_Glaucoma', 'Digestive_Pancreatitis']
In [7]:
# Draw three distinct medical-history attributes at random (person_id excluded).
cols = list(set(dfm.columns.tolist()) - set(['person_id']))
cols = np.random.choice(cols, 3, replace=False).tolist()

# Join the drawn attributes to the registration fields. The key is named
# explicitly: without `on=`, pandas joins on every column name the two frames
# have in common, not just person_id.
p = pd.merge(dfm[['person_id'] + cols], df, on='person_id')

# Columns of the merged frame that take part in the risk evaluation; `p`,
# `cols` and `fcols` are referenced by the "Medical History" text that follows.
fcols = list(set(p.columns.tolist()) - set(['person_id']))
# dfm[cols].deid.evaluate(cols=list( set(cols) - set(['person_id'])))
Medical History¶
We randomly select three attributes: {{ " ; ".join(cols)}}.
The dataset used for the associated risk evaluation contains {{ p.shape[0] }} records.
{{ p[fcols].deid.evaluate() }}
In [52]:
# Display the current value of `cols` (the column names selected by an earlier cell).
cols
Out[52]:
['person_id', 'InfectiousDiseases_Tuberculosis', 'SkeletalMuscular_Fibromyalgia', 'Cancer_ProstateCancer']
In [67]:
# dfm[cols[1:]].head()
# Count the value combinations of the selected attributes that are held by at
# most one record of dfm (missing values are replaced by ' ' before grouping).
#
# person_id is removed by name instead of slicing off the first element, so no
# attribute is lost when `cols` comes from a cell that does not put person_id
# first.
_attributes = list(cols)
if 'person_id' in _attributes:
    _attributes.remove('person_id')

# Plain groupby().size() yields a Series of group sizes. With as_index=False,
# newer pandas returns a DataFrame that also holds the key columns, and those
# would then be compared with `<= 1` as well.
_group_sizes = dfm.fillna(' ').groupby(_attributes).size()
np.sum(_group_sizes.values <= 1)
Out[67]:
3
In [ ]: