From 0b16ce94ccf6361899e6a225d7afdee4191979df Mon Sep 17 00:00:00 2001 From: Steve Nyemba Date: Tue, 11 Dec 2018 17:43:16 -0600 Subject: [PATCH] notebooks --- notebooks/Untitled.ipynb | 238 ++++++++++++ notebooks/data-analysis.ipynb | 2 +- notebooks/data-preparation.ipynb | 95 +++++ notebooks/experiments.ipynb | 610 +++++++++++++++++++++++++++++++ 4 files changed, 944 insertions(+), 1 deletion(-) create mode 100644 notebooks/Untitled.ipynb create mode 100644 notebooks/data-preparation.ipynb create mode 100644 notebooks/experiments.ipynb diff --git a/notebooks/Untitled.ipynb b/notebooks/Untitled.ipynb new file mode 100644 index 0000000..1e154e2 --- /dev/null +++ b/notebooks/Untitled.ipynb @@ -0,0 +1,238 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import itertools \n", + "import pandas as pd\n", + "import numpy as np\n", + "# from pandas_risk import *\n", + "from time import time\n", + "import os\n", + "\n", + "attr = ['gender','race','zip','year_of_birth']\n", + "comb_attr = [\n", + " ['zip' ,'gender', 'birth_datetime', 'race'], \n", + " ['zip', 'gender', 'year_of_birth', 'race'], \n", + " ['gender','race','zip'],\n", + " ['race','year_of_birth','zip']\n", + "]\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "SQL_CONTROLLED=\"SELECT * FROM deid_risk.basic_risk60k\"\n", + "dfc = pd.read_gbq(SQL_CONTROLLED,private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def risk(**args):\n", + " Yi = args['data']\n", + " Yi = Yi.fillna(' ')\n", + " sizes = args['prop'] if 'prop' in args else np.arange(5,100,5)\n", + " FLAG = args['flag'] if 'flag' in args else 'UNFLAGGED'\n", + " N = args['num_runs']\n", + " if 'cols' in args :\n", + " columns = args['cols']\n", + " else:\n", + " columns = 
list(set(Yi.columns.tolist()) - set(['person_id']))\n", + " p = pd.DataFrame()\n", + " y_i= pd.DataFrame({\"group_size\":Yi.groupby(columns,as_index=False).size()}).reset_index()\n", + " for index in sizes :\n", + " for n in np.repeat(index,N):\n", + " \n", + " # we will randomly sample n% rows from the dataset\n", + " i = np.random.choice(Yi.shape[0],((Yi.shape[0] * n)/100),replace=False)\n", + " x_i= pd.DataFrame(Yi).loc[i] \n", + " risk = x_i.deid.risk(id='person_id',quasi_id = columns)\n", + " x_i = pd.DataFrame({\"group_size\":x_i.groupby(columns,as_index=False).size()}).reset_index()\n", + "\n", + "\n", + " r = pd.merge(x_i,y_i,on=columns,how='inner')\n", + " if r.shape[0] == 0 :\n", + " continue\n", + " r['marketer'] = r.apply(lambda row: (row.group_size_x / np.float64(row.group_size_y)) /np.sum(x_i.group_size) ,axis=1)\n", + " r['sample %'] = np.repeat(n,r.shape[0])\n", + " r['tier'] = np.repeat(FLAG,r.shape[0])\n", + " r['sample marketer'] = np.repeat(risk['marketer'].values[0],r.shape[0])\n", + " # r['patient_count'] = np.repeat(r.shape[0],r.shape[0])\n", + " r = r.groupby(['sample %','tier','sample marketer'],as_index=False).sum()[['sample %','marketer','sample marketer','tier']]\n", + " p = p.append(r)\n", + " p.index = np.arange(p.shape[0]).astype(np.int64)\n", + " return p\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from pandas_risk import *\n", + "o = pd.DataFrame()\n", + "PATH=\"out/experiment-phase-2.xlsx\"\n", + "writer = pd.ExcelWriter(PATH,engine='xlsxwriter')\n", + "comb_attr = [\n", + " ['zip' ,'gender', 'birth_datetime', 'race'], \n", + " ['zip', 'gender', 'year_of_birth', 'race'], \n", + " ['gender','race','zip'],\n", + " ['race','year_of_birth','zip']\n", + "]\n", + "\n", + "for cols in comb_attr :\n", + " o = risk(data=dfc,cols=cols,flag='CONTROLLED',num_runs=5)\n", + " #\n", + " # adding the policy\n", + " x = [1* dfc.columns.isin(cols) for i in 
range(o.shape[0])]\n", + " o = o.join(pd.DataFrame(x,columns = dfc.columns))\n", + " #\n", + " # Write this to excel notebook\n", + " o.to_excel(writer,\"-\".join(cols))\n", + "# break\n", + " \n", + "\n", + "# p = p.rename(columns={'marketer_x':'sample marketer'})\n", + "# p.index = np.arange(p.shape[0]).astype(np.int64)\n", + "\n", + "writer.save()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
person_idyear_of_birthmonth_of_birthday_of_birthbirth_datetimerace_concept_idethnicity_concept_idlocation_idcare_site_idperson_source_value...gender_source_concept_idrace_source_valueethnicity_source_valuesex_at_birthbirth_dateracezipcitystategender
\n", + "

0 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [person_id, year_of_birth, month_of_birth, day_of_birth, birth_datetime, race_concept_id, ethnicity_concept_id, location_id, care_site_id, person_source_value, gender_source_value, gender_source_concept_id, race_source_value, ethnicity_source_value, sex_at_birth, birth_date, race, zip, city, state, gender]\n", + "Index: []\n", + "\n", + "[0 rows x 21 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "x = [1* dfc.columns.isin(cols) for i in range(o.shape[0])]\n", + "o.join(pd.DataFrame(x,columns = dfc.columns))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'columns' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mcolumns\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'columns' is not defined" + ] + } + ], + "source": [ + "columns\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.15rc1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/data-analysis.ipynb b/notebooks/data-analysis.ipynb index d7c44c2..8e4e21b 100644 --- a/notebooks/data-analysis.ipynb +++ 
b/notebooks/data-analysis.ipynb @@ -177,7 +177,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.10" + "version": "2.7.15rc1" }, "varInspector": { "cols": { diff --git a/notebooks/data-preparation.ipynb b/notebooks/data-preparation.ipynb new file mode 100644 index 0000000..adbd66e --- /dev/null +++ b/notebooks/data-preparation.ipynb @@ -0,0 +1,95 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " skiping ...\n", + " skiping ...\n", + " skiping ...\n", + " skiping ...\n", + " skiping ...\n", + " skiping ...\n", + " skiping ...\n" + ] + }, + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"\n", + " This notebook is designed to generate SQL syntax for all the quasi-identifiers for the patients in the database\n", + " The resulting SQL will be run against bigquery to produce a table with every record mapping to a patient\n", + " \n", + "\"\"\"\n", + "\n", + "from risk import *\n", + "ihandle = UtilHandler(path='/home/steve/dev/google-cloud-sdk/accounts/curation-prod.json',dataset='combined20180822',key_field='person_id',key_table='person',filter=['person','observation'])\n", + "r = ihandle.migrate_tables()\n", + "len(r)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "u' SELECT person.person_id , person.year_of_birth , person.month_of_birth , person.day_of_birth , person.birth_datetime , person.race_concept_id , person.ethnicity_concept_id , person.location_id , person.care_site_id , person.person_source_value , person.gender_source_value , person.gender_source_concept_id , person.race_source_value , person.ethnicity_source_value , basic_observation.sex_at_birth AS sex_at_birth1 , basic_observation.birth_date AS 
birth_date1 , basic_observation.race AS race1 , basic_observation.zip AS zip1 , basic_observation.city AS city1 , basic_observation.state AS state1 , basic_observation.gender AS gender1 FROM (select * from deid_image.person ) as person INNER JOIN (select * from deid_image.basic_observation ) as basic_observation ON basic_observation.person_id = person.person_id '" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ihandle = UtilHandler(path='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',dataset='deid_image',key_field='person_id',key_table='person',filter=['person','basic_observation'])\n", + "ihandle.create_table().replace('\\n',' ')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.15rc1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/experiments.ipynb b/notebooks/experiments.ipynb new file mode 100644 index 0000000..3d52a33 --- /dev/null +++ b/notebooks/experiments.ipynb @@ -0,0 +1,610 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + " Health Information Privacy Lab\n", + " This notebook is intended to run experiments and generate the data to be used by another notebook\n", + "\n", + " pre-requisites:\n", + " - pandas_risk This is a custom framework that will compute risk for a given dataset\n", + " - google-cloud-bigquery\n", + " - numpy\n", + "\"\"\"\n", + "import pandas as pd\n", + "import numpy as np\n", + "from pandas_risk import *\n", + "from 
time import time\n", + "import os\n", + "#\n", + "#-- Loading the dataset\n", + "class Logger :\n", + " cache = []\n", + " @staticmethod\n", + " def clear():\n", + " Logger.cache = []\n", + " @staticmethod\n", + " def log(**args) :\n", + " Logger.cache.append(args)\n", + " \n", + "SQL_CONTROLLED=\"SELECT person_id,birth_datetime,city,zip,state,race,gender FROM deid_risk.basic_risk60k\"\n", + "SQL_REGISTERED = \"SELECT person_id,birth_datetime,city,zip,state,race,gender FROM deid_risk.basic_deid_risk60k\"\n", + "dfr = pd.read_gbq(SQL_REGISTERED,private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json')\n", + "dfc = pd.read_gbq(SQL_CONTROLLED,private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json')\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample %marketersample marketertier
050.9749450.981364controlled
150.9755130.981996controlled
250.9757980.980733controlled
350.9763640.981996controlled
450.9763640.981996controlled
\n", + "
" + ], + "text/plain": [ + " sample % marketer sample marketer tier\n", + "0 5 0.974945 0.981364 controlled\n", + "1 5 0.975513 0.981996 controlled\n", + "2 5 0.975798 0.980733 controlled\n", + "3 5 0.976364 0.981996 controlled\n", + "4 5 0.976364 0.981996 controlled" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "#\n", + "FLAG='REGISTERED-TIER-1'\n", + "if FLAG == 'REGISTERED-TIER' :\n", + " Yi = pd.DataFrame(dfr)\n", + " FOLDER='registered'\n", + "else:\n", + " Yi = pd.DataFrame(dfc)\n", + " FOLDER='controlled'\n", + "Yi = Yi.fillna(' ')\n", + "N = 5\n", + "N_ = str(N)\n", + "SUFFIX = FOLDER+'-tier-'+str(N)+'-experiment.xlsx'\n", + "PATH = os.sep.join(['out',SUFFIX])\n", + "\n", + "\n", + "columns = list(set(Yi.columns.tolist()) - set(['person_id']))\n", + "merged_columns = list(columns)+['field_count']\n", + "m = {}\n", + "p = pd.DataFrame()\n", + "n = 0\n", + "y_i= pd.DataFrame({\"group_size\":Yi.groupby(columns,as_index=False).size()}).reset_index()\n", + "#.deid.risk(id='person_id',quasi_id=columns)\n", + "for index in np.arange(5,105,5):\n", + " for n in np.repeat(index,N) :\n", + "# np.random.seed( np.random.randint(0,int(time())+np.random.randint(0,1000)+index+n ) \n", + " #\n", + " # we will randomly sample n% rows from the dataset\n", + " i = np.random.choice(Yi.shape[0],((Yi.shape[0] * n)/100),replace=False)\n", + " x_i= pd.DataFrame(Yi).loc[i] \n", + " risk = x_i.deid.risk(id='person_id',quasi_id = columns)\n", + " x_i = pd.DataFrame({\"group_size\":x_i.groupby(columns,as_index=False).size()}).reset_index()\n", + " \n", + "# y_i= pd.DataFrame(Yi).deid.risk(id='person_id',quasi_id=columns)\n", + "\n", + "\n", + " r = pd.merge(x_i,y_i,on=columns,how='inner')\n", + " if r.shape[0] == 0 :\n", + " print 'skipping ',n\n", + " continue\n", + " r['marketer'] = r.apply(lambda row: (row.group_size_x / np.float64(row.group_size_y)) /np.sum(x_i.group_size) ,axis=1)\n", + " r['sample %'] = 
np.repeat(n,r.shape[0])\n", + " r['tier'] = np.repeat(FOLDER,r.shape[0])\n", + " r['sample marketer'] = np.repeat(risk['marketer'].values[0],r.shape[0])\n", + "# r['patient_count'] = np.repeat(r.shape[0],r.shape[0])\n", + " r = r.groupby(['sample %','tier','sample marketer'],as_index=False).sum()[['sample %','marketer','sample marketer','tier']]\n", + "# r['marketer'] = r.apply(lambda row: (row.group_size_x / row.group_size_y) / row.patient_count_x,axis=1 )\n", + "# r = r.groupby(columns+['marketer_x'],as_index=False).sum()[columns+['marketer','marketer_x']]\n", + "# r['sample %'] = np.repeat(n,r.shape[0])\n", + "# r['tier'] = np.repeat(FOLDER,r.shape[0])\n", + " p = p.append(r)\n", + "\n", + "writer = pd.ExcelWriter(PATH,engine='xlsxwriter')\n", + "p = p.rename(columns={'marketer_x':'sample marketer'})\n", + "p.index = np.arange(p.shape[0]).astype(np.int64)\n", + "p.to_excel(writer,FOLDER)\n", + "writer.save()\n", + "p.head() " + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "p.plot(kind='scatter',x='sample %',y='marketer', c = '#4682B4')\n", + "p.plot(kind='scatter',x='sample %',y = 'sample marketer', c='#CC0000')\n", + "ax = p.plot(kind='scatter',x='sample %',y='marketer', c = '#4682B4')\n", + "p.plot(kind='scatter',x='sample %',y = 'sample marketer', c='#CC0000',ax=ax)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + " This experiment consists in :\n", + " 1: randomly selecting x % of the records to be sampled\n", + " 2: running a group by on the sample\n", + " 3: calling groupby on the population which th\n", + "\"\"\"\n", + "SQL_ORIGINAL=\"SELECT * FROM deid_risk.risk_60k2\"\n", + "SQL_DEID = \"SELECT * FROM deid_risk.deid_risk_60k limit 20000\"\n", + "# df = pd.read_gbq(SQL_DEID,private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json')\n", + "\n", + "#\n", + "FLAG='REGISTERED-TIER-9'\n", + "if FLAG == 'REGISTERED-TIER' :\n", + " Yi = pd.DataFrame(dfr)\n", + " FOLDER='registered'\n", + "else:\n", + " Yi = pd.DataFrame(dfc)\n", + " FOLDER='controlled'\n", + "N = 20\n", + "N_ = str(N)\n", + "SUFFIX = FOLDER+'-tier-'+str(N)+'-experiment.xlsx'\n", + "PATH = os.sep.join(['out',SUFFIX])\n", + "\n", + "\n", + "columns = list(set(Yi.columns.tolist()) - set(['person_id']))\n", + "merged_columns = list(columns)+['field_count']\n", + "m = {}\n", + "p = pd.DataFrame()\n", + "n = 0\n", + "y_i= pd.DataFrame(Yi).deid.risk(id='person_id',quasi_id=columns)\n", + "for index in np.arange(5,105,5):\n", + "# np.random.seed( int(time())+np.random.randint(0,100)+index ) \n", + "# n = np.random.randint(10,35) #-- randomly pick a number within an interval\n", + " \n", + " for n in np.repeat(index,20) :\n", + "# np.random.seed( np.random.randint(0,int(time())+np.random.randint(0,1000)+index+n ) \n", + " #\n", + " # we will randomly sample n% 
rows from the dataset\n", + " i = np.random.choice(Yi.shape[0],((Yi.shape[0] * n)/100),replace=False)\n", + " x_i= pd.DataFrame(Yi).loc[i].deid.risk(id='person_id',quasi_id = columns)\n", + " \n", + "# y_i= pd.DataFrame(Yi).deid.risk(id='person_id',quasi_id=columns)\n", + "\n", + "\n", + " r = pd.merge(x_i,y_i,on=merged_columns,how='inner')\n", + " if r.shape[0] == 0 :\n", + " print 'skipping ',n\n", + " continue\n", + "\n", + " r['marketer'] = r.apply(lambda row: (row.group_size_x / row.group_size_y) / row.patient_count_x,axis=1 )\n", + " r = r.groupby(columns+['marketer_x'],as_index=False).sum()[columns+['marketer','marketer_x']]\n", + " r['sample %'] = np.repeat(n,r.shape[0])\n", + " r['tier'] = np.repeat(FOLDER,r.shape[0])\n", + " p = p.append(r)\n", + "\n", + "writer = pd.ExcelWriter(PATH,engine='xlsxwriter')\n", + "p = p.rename(columns={'marketer_x':'sample marketer'})\n", + "p.index = np.arange(p.shape[0]).astype(np.int64)\n", + "p.to_excel(writer,FOLDER)\n", + "writer.save()\n", + "p.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "ax = p.plot(kind='scatter',x='sample %',y='marketer',c='r',ylim=[p.marketer.min(),p.marketer.max()])\n", + "p.plot(kind='scatter',x='sample %',y='sample marketer',c='#4682B4')\n", + "ax = p.plot(kind='scatter',x='sample %',y='marketer',c='r')\n", + "p.plot(kind='scatter',x='sample %',y='sample marketer',c='#4682B4',ax=ax)\n", + "\n", + "_p = pd.DataFrame(p)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "p.head()\n", + "\n", + "# writer = pd.ExcelWriter('out/foo.xlsx',engine='xlsxwriter')\n", + "# workbook = writer.book\n", + "# r.groupby('field_count',as_index=False).sum()[['field_count','marketer_x']].to_excel(writer,'page-0')\n", + "# chart = workbook.add_chart({'type':'line'})\n", + "# o = 
r.groupby('field_count',as_index=False).sum()[['field_count','marketer_x']]\n", + "# # values = o.marketer_x.tolist()\n", + "# # values = [['page-0',item] for item in values]\n", + "# # chart.add_series({\"values\":values})\n", + "# # chart.add_series({'values':'=page-0!$B$2:$B$5'})\n", + "\n", + "# worksheet = writer.sheets['page-0']\n", + "# worksheet.insert_chart('G2',chart)\n", + "# writer.save()\n", + "\n", + "str(10)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "help(chart.add_series)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cols = list(set(dfr.columns.tolist()) - set(['person_id'])) + ['field_count']\n", + "r = pd.merge(x_i,y_i,on=cols,how='inner')\n", + "r['marketer'] = r.apply(lambda row: (row.group_count_x/row.group_count_y)/row.patient_count_y ,axis=1)\n", + "# r['field_count'] = r['field_count_x']\n", + "o = r.groupby(cols,as_index=False).sum()[cols+['marketer']]\n", + "o.groupby(['field_count'],as_index=False).mean()\n", + "# o.groupby('field_count',as_index=False).mean().plot.line(x='field_count',y='marketer')\n", + "# r.head()\n", + "# N = r.patient_count_y.mean()\n", + "# r['marketer'] = r.apply(lambda row: row.group_count_x / row.group_count_y,axis=1)\n", + "# m = r.groupby(['field_count'],as_index=False).mean()[['field_count','marketer']]\n", + "# m.marketer = m.marketer / N\n", + "# m.groupby(['field_count']).mean().plot.line()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "p.to_csv('out/x-2/single-runs-deid.csv',index=False)\n", + "p.groupby(['sample %']).mean()['marketer'].plot.line()\n", + "p.groupby(['sample %'],as_index=False).mean().plot.scatter(x='sample %',y='marketer')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y = 
pd.DataFrame({\"name\":['d','e','f','g'],\"age\":[12,40,20,30],\"income\":[100,200,300,400]})\n", + "x = pd.DataFrame({\"name\":['a','b','c'],\"age\":[10,20,40],\"income\":[120,100,200]})\n", + "\n", + "# x.join(y,how='outer',on='age')\n", + "x_ = pd.merge(x,y,on=['age','income'],how='outer')\n", + "Logger.log(action='merge',value=x_.shape)\n", + "Logger.cache" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#\n", + "# EXP_0\n", + "# Running the experiment on the Original dataset, with all the attributes\n", + "SCHEMA = \"deid_risk\"\n", + "df = pd.read_gbq(\"select person_id,birth_datetime,race,gender,sex_at_birth, city,state,zip from deid_risk.basic_risk60k \",private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',\n", + " dialect='standard')\n", + "\n", + "RUNS = 500\n", + "FLAG = 'basic-features'\n", + "r = df.deid.risk(id='person_id',num_runs=RUNS) #,field_count=11)\n", + "# r.to_csv('out/pandas-60k-'+FLAG+'-patients-'+str(RUNS)+'-x-runs.csv')\n", + "compiled = r.groupby('field_count',as_index=False)['marketer','prosecutor'].mean()\n", + "fi = compiled[['marketer','prosecutor']].plot.line().get_figure()\n", + "# fo\n", + "# r.plot.line(x='field_count',y='marketer')\n", + "compiled = r.groupby('field_count',as_index=False)['field_count','marketer','prosecutor'].mean()\n", + "fig_i = r.plot.scatter(x='field_count',y='marketer').get_figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#\n", + "# EXP_2 :\n", + "# This experiment will run the marketer risk against individual attributes\n", + "deid_df = pd.read_gbq(\"select person_id,birth_datetime,race,gender,sex_at_birth, city,state,zip from deid_risk.basic_deid_risk60k\",private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',\n", + " dialect='standard')\n", + "RUNS = 500\n", + "FLAG = 'basic-deid-features'\n", + "deid_r = 
deid_df.deid.risk(id='person_id',num_runs=RUNS) #,field_count=11)\n", + "# r.to_csv('out/pandas-60k-'+FLAG+'-patients-'+str(RUNS)+'-x-runs.csv')\n", + "deid_compiled = deid_r.groupby('field_count',as_index=False)['marketer','prosecutor'].mean()\n", + "fo = deid_compiled[['marketer','prosecutor']].plot.line().get_figure()\n", + "# fo\n", + "# r.plot.line(x='field_count',y='marketer')\n", + "# deid_compiled = deid_r.groupby('field_count',as_index=False)['field_count','marketer','prosecutor'].mean()\n", + "fig_o = deid_r.plot.scatter(x='field_count',y='marketer').get_figure()\n", + "\n", + "# orig_df = pd.read_gbq(\"select * from deid_risk.risk_60k2\",private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',\n", + "# dialect='standard')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# deid_r.to_csv('out/basic-attributes-deid-data-60k-patients.csv')\n", + "# r.to_csv('out/basic-attributes-raw-data-60k-patients.csv')\n", + "# deid_r.head()\n", + "p = pd.DataFrame()\n", + "p = deid_df.deid.risk(id='person_id',quasi_id=['birth_datetime','race','gender','sex_at_birth', 'city','state','zip'])\n", + "p = p.append(df.deid.risk(id='person_id',quasi_id=['birth_datetime','race','gender','sex_at_birth', 'city','state','zip']))\n", + "p.index = ['deid data','raw data']\n", + "p.to_csv('out/basic_run-7-fields.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cols = deid_r.columns[5:]\n", + "deid_r.index = np.arange(deid_r.shape[0]).astype(np.int64)\n", + "xdeid_ = deid_r[cols].sum().tolist()\n", + "xraw_ = r[cols].sum().tolist()\n", + "o = pd.DataFrame()\n", + "o['name'] = cols\n", + "o['raw'] = xraw_\n", + "o['deid']= xdeid_\n", + "\n", + "\n", + "o\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "columns = list( set(orig_df.columns) - 
set(['person_id']))\n", + "xo = pd.DataFrame()\n", + "xi = pd.DataFrame()\n", + "#\n", + "# Let's compute the risk for every attribute given the list of attributes we've gathered\n", + "#\n", + "for name in columns :\n", + " xo = xo.append(deid_df.deid.risk(id='person_id',quasi_id=[name])[['marketer','prosecutor']],sort=False)\n", + " xi = xi.append(orig_df.deid.risk(id='person_id',quasi_id=[name])[['marketer','prosecutor']],sort=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#\n", + "# The following shows how much the deid process has affected each attribute\n", + "#\n", + "\n", + "RISK_THRESHOLD = 0.5\n", + "xo.index = columns\n", + "xi.index = columns\n", + "\n", + "ii = xi[xi.marketer > RISK_THRESHOLD].index\n", + "# zo = pd.concat([xi.loc[ii],xo.loc[ii]])\n", + "\n", + "zo = xi.loc[ii].join(xo.loc[ii],rsuffix='_deid')\n", + "#\n", + "# heatmap for original data\n", + "# fig_o = sns.heatmap(xi.loc[ii], cmap='RdYlGn_r', linewidths=0.5, annot=True).get_figure()\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#\n", + "# Running the experiment on the DEID dataset, with all the attributes\n", + "#\n", + "df = pd.read_gbq(\"select * from deid_risk.deid_risk_60k\",private_key='/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',\n", + " dialect='standard')\n", + "\n", + "RUNS = 1500\n", + "FLAG = 'deid-full-attr-dataset'\n", + "r = df.deid.risk(id='person_id',num_runs=RUNS) #,field_count=11)\n", + "# r.to_csv('out/pandas-60k-'+FLAG+'-patients-'+str(RUNS)+'-x-runs.csv')\n", + "compiled = r.groupby('field_count',as_index=False)['marketer','prosecutor'].mean()\n", + "fo = compiled[['marketer','prosecutor']].plot.line().get_figure()\n", + "# fo\n", + "# r.plot.line(x='field_count',y='marketer')\n", + "compiled = r.groupby('field_count',as_index=False)['field_count','marketer','prosecutor'].mean()\n", 
+ "fig_o = r.plot.scatter(x='field_count',y='marketer').get_figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "r.groupby('field_count',as_index=False)['marketer','prosecutor'].var()[['marketer','prosecutor']].plot.line()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#\n", + "# We are going to look into the attributes with a risk of a given threshold\n", + "# We will run the experiment (varied combinations of the list of attributes)\n", + "# The experiment is intended to capture the attributes responsible for increasing the marketer risk\n", + "#\n", + "DEID_DATASET = 'deid_risk.deid_risk_60k2'\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.15rc1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}