33 KiB
33 KiB
None
<html lang="en">
<head>
</head>
</html>
In [9]:
"""
This notebook is designed to run experiments around demographics on the
registered tier.
"""
import os

import pandas as pd
import numpy as np
# NOTE(review): the star import is kept unchanged. Later cells call the `.deid`
# DataFrame accessor, which presumably is registered by importing pandas_risk --
# confirm before narrowing this to a plain `import pandas_risk`.
from pandas_risk import *

ATTRIBUTES = [
    'race', 'ethnicity', 'birth_date', 'state', 'city', 'zip',
    'marital_status', 'education', 'language', 'home_owner', 'income',
    'employment_status', 'living_situation', 'active_duty_status',
    'gender_identity', 'birth_place', 'death_date', 'death_cause',
    'orientation',
]

# Path of the service-account key handed to read_gbq. It can be overridden with
# the DEID_RISK_PRIVATE_KEY environment variable so the notebook is not tied to
# one workstation; when the variable is unset the original path is used.
PRIVATE_KEY_PATH = os.environ.get(
    'DEID_RISK_PRIVATE_KEY',
    '/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',
)

# Scenario settings: the columns `feature`, `fi` and `fo` are read from this
# frame by the following cells.
dfs = pd.read_csv('scenario-settings.csv')

# Full demographics table pulled from BigQuery.
# NOTE(review): recent pandas releases replace `private_key` with `credentials`;
# check the installed version before upgrading.
dfc = pd.read_gbq("SELECT * FROM deid_risk.registered_dec_01", private_key=PRIVATE_KEY_PATH)
In [10]:
def _without_dates(features):
    """Return `features` minus the date fields, keeping the original order.

    The date fields are removed because dates are shifted.
    """
    return [name for name in features if name not in ['birth_date', 'death_date']]


# Features flagged in both `fo` and `fi`.
cols_o = _without_dates(dfs.loc[(dfs.fo & dfs.fi) == 1, 'feature'].tolist())
# Features flagged in at least one of `fo` / `fi`.
cols_i = _without_dates(dfs.loc[(dfs.fo + dfs.fi) >= 1, 'feature'].tolist())
# Every feature listed in the settings file.
cols_a = _without_dates(dfs.feature.tolist())
#-- voter registration
cols_v = _without_dates(['birth_date', 'gender_identity', 'race', 'state', 'city', 'birth_place'])
In [11]:
# print(dfs) # print(cols_o) # print(cols_i)
In [12]:
# Run the `deid.evaluate()` accessor once per column selection and stack the
# results; the order here must match the order of the flag labels below.
scenario_columns = [cols_o, cols_i, cols_a, cols_v]
r = pd.concat([dfc[columns].deid.evaluate() for columns in scenario_columns])

# Replace the concatenated index with a plain 0..n-1 int64 index.
r.index = np.arange(len(r)).astype(np.int64)

# Label each row with its scenario (assigned positionally).
r['flag'] = ['high-conj', 'high-disj', 'all', 'voter-reg']
r
Out[12]:
field_count | flag | group_count | marketer | prosecutor | unique_row_ratio | |
---|---|---|---|---|---|---|
0 | 8 | high-conj | 6532 | 0.056234 | 1.0 | 0.021368 |
1 | 11 | high-disj | 47447 | 0.408473 | 1.0 | 0.278554 |
2 | 16 | all | 60718 | 0.522724 | 1.0 | 0.408189 |
3 | 5 | voter-reg | 1316 | 0.011329 | 1.0 | 0.002944 |
In [14]:
# Bar chart of the `marketer` column, one bar per scenario flag; the owning
# Figure is kept in `fig_o`.
marketer_axes = r.plot(kind='bar', x='flag', y=['marketer'])
fig_o = marketer_axes.get_figure()
In [15]:
# Write the scenario table `r` to sheet 'phase-1' of the phase-1 workbook.
# The context manager saves and closes the file when the block exits, including
# when to_excel raises, so no explicit writer.save() call is needed.
with pd.ExcelWriter('out-116kpatients-phase-1.xlsx', engine='xlsxwriter') as writer:
    r.to_excel(writer, sheet_name='phase-1')
In [19]:
# Display the scenario settings frame read from 'scenario-settings.csv'
# (one row per feature, with its `fi` and `fo` flags).
dfs
Out[19]:
feature | fi | fo | |
---|---|---|---|
0 | race | 1 | 1 |
1 | ethnicity | 1 | 1 |
2 | birth_date | 1 | 1 |
3 | city | 1 | 1 |
4 | state | 1 | 1 |
5 | marital_status | 1 | 1 |
6 | education | 1 | 0 |
7 | language | 0 | 0 |
8 | home_owner | 1 | 1 |
9 | income | 0 | 1 |
10 | employment_status | 1 | 0 |
11 | living_situation | 0 | 0 |
12 | active_duty_status | 0 | 0 |
13 | gender_identity | 1 | 1 |
14 | birth_place | 0 | 0 |
15 | death_date | 1 | 1 |
16 | death_cause | 1 | 1 |
17 | orientation | 0 | 0 |
In [38]:
import os

import pandas as pd
import numpy as np

# Values of the `name` column of the family-history file.
names = pd.read_csv('family-history.csv').name.tolist()

# Service-account key for BigQuery. It can be overridden with the
# DEID_RISK_PRIVATE_KEY environment variable; when the variable is unset the
# original path is used.
path = os.environ.get(
    'DEID_RISK_PRIVATE_KEY',
    '/home/steve/dev/google-cloud-sdk/accounts/curation-test.json',
)

# The query is defined once and passed to read_gbq, instead of being repeated
# as a second string literal in the call.
sql = """
    SELECT *
    FROM deid_risk.registered_medical_history_dec_001
"""
dfm = pd.read_gbq(sql, private_key=path, dialect='standard')
In [69]:
# Every column except the person identifier. The columns are filtered in the
# table's own order rather than through a set difference, whose ordering is
# arbitrary and would make the row order of `r` vary between runs.
cols = [column for column in dfm.columns.tolist() if column != 'person_id']

# NOTE(review): this rebinds `r`, which an earlier cell uses for the scenario
# table; the Excel cell further down reads whichever `r` was assigned last.
# One row per attribute: non-null count, attribute name, and the count as a
# percentage of all rows in dfm.
r = pd.DataFrame(dfm[cols].count(), columns=['counts'])
r['attributes'] = r.index
r['rate'] = 100 * (r.counts / dfm.shape[0])

# Mean and sample standard deviation of the per-attribute rate.
r.rate.mean(), np.sqrt(r.rate.var())
Out[69]:
(0.9343780009344719, 1.269831148073964)
In [81]:
# Scratch workbook used to experiment with the xlsxwriter engine.
# NOTE(review): no save()/close() call on this writer is visible in this part
# of the notebook -- confirm the file is finalized elsewhere if it is needed.
writer = pd.ExcelWriter('/home/steve/tmp/simple.xlsx', engine='xlsxwriter')
r.to_excel(writer, sheet_name='p1')

# Underlying workbook object, plus an extra worksheet with a default name.
workbook = writer.book
worksheet = workbook.add_worksheet()

# 30 random integers drawn from range(10), in a single `id` column.
b = pd.DataFrame({"id": np.random.choice(10, size=30)})
In [80]:
# Exploratory introspection: list the attribute and method names available on
# the worksheet object returned by workbook.add_worksheet().
dir(worksheet)
Out[80]:
['__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_assemble_xml_file', '_button_params', '_calculate_spans', '_calculate_x_split_width', '_check_dimensions', '_comment_params', '_convert_date_time', '_convert_name_area', '_csv_join', '_encode_password', '_escape_attributes', '_escape_data', '_escape_url', '_extract_filter_tokens', '_get_palette_color', '_get_range_data', '_initialize', '_isinf', '_isnan', '_opt_close', '_opt_reopen', '_parse_filter_expression', '_parse_filter_tokens', '_position_object_emus', '_position_object_pixels', '_prepare_chart', '_prepare_header_image', '_prepare_header_vml_objects', '_prepare_image', '_prepare_shape', '_prepare_tables', '_prepare_vml_objects', '_set_filehandle', '_set_icon_props', '_set_spark_color', '_set_xml_writer', '_size_col', '_size_row', '_sort_pagebreaks', '_table_function_to_formula', '_write', '_write_array_formula', '_write_auto_filter', '_write_autofilters', '_write_blank', '_write_boolean', '_write_brk', '_write_cell', '_write_cell_array_formula', '_write_cell_value', '_write_cf_rule', '_write_cfvo', '_write_col_breaks', '_write_col_info', '_write_color', '_write_color_axis', '_write_color_first', '_write_color_high', '_write_color_last', '_write_color_low', '_write_color_markers', '_write_color_negative', '_write_color_scale', '_write_color_series', '_write_cols', '_write_conditional_formats', '_write_conditional_formatting', '_write_conditional_formatting_2010', '_write_custom_filter', '_write_custom_filters', '_write_data_bar', '_write_data_bar_ext', '_write_data_validation', '_write_data_validations', '_write_datetime', '_write_dimension', '_write_drawing', '_write_drawings', '_write_empty_row', '_write_ext', '_write_ext_list', '_write_ext_list_data_bars', '_write_ext_list_sparklines', '_write_filter', 
'_write_filter_column', '_write_filters', '_write_font', '_write_formula', '_write_formula_1', '_write_formula_2', '_write_formula_element', '_write_freeze_panes', '_write_header_footer', '_write_hyperlink_external', '_write_hyperlink_internal', '_write_hyperlinks', '_write_icon_set', '_write_legacy_drawing', '_write_legacy_drawing_hf', '_write_merge_cell', '_write_merge_cells', '_write_number', '_write_odd_footer', '_write_odd_header', '_write_optimized_sheet_data', '_write_outline_pr', '_write_page_margins', '_write_page_set_up_pr', '_write_page_setup', '_write_panes', '_write_phonetic_pr', '_write_print_options', '_write_rich_string', '_write_row', '_write_row_breaks', '_write_rows', '_write_rstring_color', '_write_selection', '_write_selections', '_write_sheet_data', '_write_sheet_format_pr', '_write_sheet_pr', '_write_sheet_protection', '_write_sheet_view', '_write_sheet_views', '_write_single_row', '_write_spark_color', '_write_sparkline_group', '_write_sparkline_groups', '_write_sparklines', '_write_split_panes', '_write_string', '_write_tab_color', '_write_table_part', '_write_table_parts', '_write_token_as_string', '_write_underline', '_write_url', '_write_vert_align', '_write_worksheet', '_write_x14_axis_color', '_write_x14_border_color', '_write_x14_cf_rule', '_write_x14_cfvo', '_write_x14_data_bar', '_write_x14_negative_border_color', '_write_x14_negative_fill_color', '_xml_close', '_xml_data_element', '_xml_declaration', '_xml_empty_tag', '_xml_empty_tag_unencoded', '_xml_end_tag', '_xml_formula_element', '_xml_inline_string', '_xml_number_element', '_xml_rich_inline_string', '_xml_rich_si_element', '_xml_si_element', '_xml_start_tag', '_xml_start_tag_unencoded', '_xml_string_element', 'activate', 'active', 'active_pane', 'add_sparkline', 'add_table', 'autofilter', 'autofilter_area', 'autofilter_ref', 'black_white', 'buttons_list', 'center_horizontally', 'center_vertically', 'charts', 'col_formats', 'col_size_changed', 'col_sizes', 'colinfo', 
'comments', 'comments_author', 'comments_list', 'comments_visible', 'cond_formats', 'conditional_format', 'constant_memory', 'data_bars_2010', 'data_validation', 'date_1904', 'default_col_pixels', 'default_date_format', 'default_row_height', 'default_row_pixels', 'default_row_zeroed', 'default_url_format', 'dim_colmax', 'dim_colmin', 'dim_rowmax', 'dim_rowmin', 'draft_quality', 'drawing', 'drawing_links', 'dxf_priority', 'escapes', 'excel2003_style', 'excel_version', 'ext_sheets', 'external_comment_links', 'external_drawing_links', 'external_hyper_links', 'external_table_links', 'external_vml_links', 'fh', 'fileclosed', 'filter_cols', 'filter_column', 'filter_column_list', 'filter_on', 'filter_range', 'filter_type', 'fit_height', 'fit_page', 'fit_to_pages', 'fit_width', 'footer', 'footer_images', 'freeze_panes', 'get_name', 'has_comments', 'has_header_vml', 'has_vml', 'hbreaks', 'hcenter', 'header', 'header_footer_aligns', 'header_footer_changed', 'header_footer_scales', 'header_images', 'header_images_list', 'hidden', 'hide', 'hide_gridlines', 'hide_row_col_headers', 'hide_zero', 'hlink_count', 'hlink_refs', 'horizontal_dpi', 'hyperlinks', 'images', 'index', 'insert_button', 'insert_chart', 'insert_image', 'insert_textbox', 'internal_fh', 'is_chartsheet', 'is_right_to_left', 'last_shape_id', 'leading_zeros', 'margin_bottom', 'margin_footer', 'margin_header', 'margin_left', 'margin_right', 'margin_top', 'merge', 'merge_range', 'name', 'names', 'nan_inf_to_errors', 'orientation', 'original_row_height', 'outline_below', 'outline_changed', 'outline_col_level', 'outline_on', 'outline_right', 'outline_row_level', 'outline_settings', 'outline_style', 'page_order', 'page_setup_changed', 'page_start', 'page_view', 'palette', 'panes', 'paper_size', 'previous_row', 'print_across', 'print_area', 'print_area_range', 'print_comments', 'print_gridlines', 'print_headers', 'print_options_changed', 'print_row_col_headers', 'print_scale', 'protect', 'protect_options', 'rel_count', 
'remove_timezone', 'repeat_col_range', 'repeat_columns', 'repeat_row_range', 'repeat_rows', 'right_to_left', 'row_col_headers', 'row_data_fh', 'row_data_fh_closed', 'row_data_filename', 'row_size_changed', 'row_sizes', 'row_spans', 'rstring', 'screen_gridlines', 'select', 'selected', 'selections', 'set_cols', 'set_column', 'set_comments_author', 'set_default_row', 'set_first_sheet', 'set_footer', 'set_h_pagebreaks', 'set_header', 'set_landscape', 'set_margins', 'set_page_view', 'set_paper', 'set_portrait', 'set_print_scale', 'set_row', 'set_rows', 'set_selection', 'set_start_page', 'set_tab_color', 'set_v_pagebreaks', 'set_vba_name', 'set_zoom', 'shape_hash', 'shapes', 'show_comments', 'show_zeros', 'sparklines', 'split_panes', 'str_table', 'strings_to_formulas', 'strings_to_numbers', 'strings_to_urls', 'tab_color', 'table', 'tables', 'tmpdir', 'use_data_bars_2010', 'validations', 'vba_codename', 'vbreaks', 'vcenter', 'vertical_dpi', 'vml_data_id', 'vml_drawing_links', 'vml_header_id', 'vml_shape_id', 'worksheet_meta', 'write', 'write_array_formula', 'write_blank', 'write_boolean', 'write_column', 'write_comment', 'write_datetime', 'write_formula', 'write_match', 'write_number', 'write_rich_string', 'write_row', 'write_string', 'write_url', 'xls_colmax', 'xls_rowmax', 'xls_strmax', 'zoom', 'zoom_scale_normal']
In [ ]: