privacykit/risk.ipynb at 886230e647cdc84fd1631356066e032edfebe58d

11 KiB

Raw Blame History

None <html lang="en"> <head> </head>

In [1]:

import pandas as pd
import numpy as np
from google.cloud import bigquery as bq

client = bq.Client.from_service_account_json('/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json')
# pd.read_gbq(query="select * from raw.observation limit 10",private_key='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json')
jobs = client.list_jobs()
for job in jobs :
#     print dir(job)
    print job.user_email,job.job_id,job.started, job.state
    break

dev-deid-600@aou-res-deid-vumc-test.iam.gserviceaccount.com df0ac049-d5b6-416f-ab3c-6321eda919d6 2018-09-25 08:18:34.829000+00:00 DONE

In [33]:

xo = ['person_id','date_of_birth','race']
xi = ['person_id','value_as_number','value_source_value']

In [10]:

def get_tables(client,id,fields=[]):
    """
        getting table lists from google
    """
    r = []
    ref = client.dataset(id)
    tables = list(client.list_tables(ref))
    for table in tables :
        ref = table.reference
        schema = client.get_table(ref).schema
        names = [f.name for f in schema]
        x = list(set(names) & set(fields))
        if x  :
            r.append({"name":table.table_id,"fields":names})
    return r
    
def get_fields(**args):
    """
        This function will generate a random set of fields from two tables. Tables are structured as follows 
        {name,fields:[],"y":}, with 
            name     table name (needed to generate sql query)
            fields   list of field names, used in the projection
            y        name of the field to be joined.
        @param xo candidate table in the join
        @param xi candidate table in the join
        @param join field by which the tables can be joined.
    """
    # The set operation will remove redundancies in the field names (not sure it's a good idea)
#     xo = args['xo']['fields']
#     xi = args['xi']['fields']
#     zi = args['xi']['name']
#     return list(set([ ".".join([args['xo']['name'],name]) for name in xo]) | set(['.'.join([args['xi']['name'],name]) for name in xi if name != args['join']]) )
    xo = args['xo']
    fields = [".".join([args['xo']['name'],name]) for name in args['xo']['fields']]
    if not isinstance(args['xi'],list) :
        x_ = [args['xi']]
    else:
        x_ = args['xi']
    for xi in x_ :
        fields += (['.'.join([xi['name'], name]) for name in xi['fields'] if name != args['join']])
    return fields
def generate_sql(**args):
    """
        This function will generate the SQL query for the resulting join
    """
    
    xo = args['xo']
    x_ = args['xi']
    xo_name = ".".join([args['prefix'],xo['name'] ]) if 'prefix' in args else xo['name']
    SQL = "SELECT :fields FROM :xo.name ".replace(":xo.name",xo_name)
    if not isinstance(x_,list):
        x_ = [x_]
    f = []#[".".join([args['xo']['name'],args['join']] )]    
    INNER_JOINS = []
    for xi in x_ :
        xi_name = ".".join([args['prefix'],xi['name'] ]) if 'prefix' in args else xi['name']
        JOIN_SQL = "INNER JOIN :xi.name ON ".replace(':xi.name',xi_name)
        value  = ".".join([xi['name'],args['join']])
        f.append(value) 
        
        ON_SQL = ""
        tmp = []
        for term in f :
            ON_SQL = ":xi.name.:ofield = :xo.name.:ofield".replace(":xo.name",xo['name'])
            ON_SQL = ON_SQL.replace(":xi.name.:ofield",term).replace(":ofield",args['join'])
            tmp.append(ON_SQL)
        INNER_JOINS += [JOIN_SQL + " AND ".join(tmp)]
    return SQL + " ".join(INNER_JOINS)
def get_final_sql(**args):
    xo = args['xo']
    xi = args['xi']
    join=args['join']
    prefix = args['prefix'] if 'prefix' in args else ''
    fields = get_fields (xo=xo,xi=xi,join=join)
    k = len(fields)
    n = np.random.randint(2,k) #-- number of fields to select
    i = np.random.randint(0,k,size=n)
    fields = [name for name in fields if fields.index(name) in i]
    base_sql = generate_sql(xo=xo,xi=xi,prefix)
    SQL = """
        SELECT AVERAGE(count),size,n as selected_features,k as total_features
        FROM(
            SELECT COUNT(*) as count,count(:join) as pop,sum(:n) as N,sum(:k) as k,:fields
            FROM (:sql)
        GROUP BY :fields
        ) 
        order by 1
        
    """.replace(":sql",base_sql)
#     sql = "SELECT :fields FROM :xo.name INNER JOIN :xi.name ON :xi.name.:xi.y = :xo.y "
#     fields = ",".join(get_fields(xo=xi,xi=xi,join=xi['y']))
    
    
#     sql = sql.replace(":fields",fields).replace(":xo.name",xo['name']).replace(":xi.name",xi['name'])
#     sql = sql.replace(":xi.y",xi['y']).replace(":xo.y",xo['y'])
#     return sql

In [33]:

xo = {"name":"person","fields":['person_id','date_of_birth','race','value_as_number']}
xi = [{"name":"measurement","fields":['person_id','value_as_number','value_source_value']}] #,{"name":"observation","fields":["person_id","value_as_string","observation_source_value"]}]
# generate_sql(xo=xo,xi=xi,join="person_id",prefix='raw')
fields = get_fields(xo=xo,xi=xi,join='person_id')
ofields = list(fields)
k = len(fields)
n = np.random.randint(2,k) #-- number of fields to select
i = np.random.randint(0,k,size=n)
fields = [name for name in fields if fields.index(name) in i]

In [34]:

fields

Out[34]:

['person.race', 'person.value_as_number', 'measurement.value_source_value']

In [55]:

xo = {"name":"person","fields":['person_id','date_of_birth','race'],"y":"person_id"}
xi = {"name":"measurements","fields":['person_id','value_as_number','value_source_value'],"y":"person_id"}
generate_sql(xo=xo,xi=xi)

Out[55]:

'SELECT person_id,value_as_number,measurements.value_source_value,measurements.value_as_number,value_source_value FROM person INNER JOIN measurements ON measurements.person_id = person_id '

In [59]:

"""
    We are designing a process that will take two tables that will generate 
"""
import itertools
list(itertools.combinations(['a','b','c'],2))

Out[59]:

[('a', 'b'), ('a', 'c'), ('b', 'c')]

In [6]:

#
# find every table with person id at the very least or a subset of fields
#
np.random.randint(0,4,size=4)

Out[6]:

array([1, 3, 0, 0])

In [90]:

list(set(['a','b']) & set(['a']))

Out[90]:

['a']

In [120]:

x_ = 1

In [10]:

x_ = pd.DataFrame({"group":[1,1,1,1,1], "size":[2,1,1,1,1]})

In [12]:

x_.groupby(['group']).mean()

Out[12]:

	size
group
1	1.2

In [ ]:

</html>

11 KiB Raw Blame History

11 KiB

Raw Blame History