11 KiB
11 KiB
None
<html lang="en">
<head>
</head>
</html>
In [1]:
import pandas as pd import numpy as np from google.cloud import bigquery as bq client = bq.Client.from_service_account_json('/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json') # pd.read_gbq(query="select * from raw.observation limit 10",private_key='/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json') jobs = client.list_jobs() for job in jobs : # print dir(job) print job.user_email,job.job_id,job.started, job.state break
dev-deid-600@aou-res-deid-vumc-test.iam.gserviceaccount.com df0ac049-d5b6-416f-ab3c-6321eda919d6 2018-09-25 08:18:34.829000+00:00 DONE
In [33]:
xo = ['person_id','date_of_birth','race'] xi = ['person_id','value_as_number','value_source_value']
In [10]:
def get_tables(client,id,fields=[]): """ getting table lists from google """ r = [] ref = client.dataset(id) tables = list(client.list_tables(ref)) for table in tables : ref = table.reference schema = client.get_table(ref).schema names = [f.name for f in schema] x = list(set(names) & set(fields)) if x : r.append({"name":table.table_id,"fields":names}) return r def get_fields(**args): """ This function will generate a random set of fields from two tables. Tables are structured as follows {name,fields:[],"y":}, with name table name (needed to generate sql query) fields list of field names, used in the projection y name of the field to be joined. @param xo candidate table in the join @param xi candidate table in the join @param join field by which the tables can be joined. """ # The set operation will remove redundancies in the field names (not sure it's a good idea) # xo = args['xo']['fields'] # xi = args['xi']['fields'] # zi = args['xi']['name'] # return list(set([ ".".join([args['xo']['name'],name]) for name in xo]) | set(['.'.join([args['xi']['name'],name]) for name in xi if name != args['join']]) ) xo = args['xo'] fields = [".".join([args['xo']['name'],name]) for name in args['xo']['fields']] if not isinstance(args['xi'],list) : x_ = [args['xi']] else: x_ = args['xi'] for xi in x_ : fields += (['.'.join([xi['name'], name]) for name in xi['fields'] if name != args['join']]) return fields def generate_sql(**args): """ This function will generate the SQL query for the resulting join """ xo = args['xo'] x_ = args['xi'] xo_name = ".".join([args['prefix'],xo['name'] ]) if 'prefix' in args else xo['name'] SQL = "SELECT :fields FROM :xo.name ".replace(":xo.name",xo_name) if not isinstance(x_,list): x_ = [x_] f = []#[".".join([args['xo']['name'],args['join']] )] INNER_JOINS = [] for xi in x_ : xi_name = ".".join([args['prefix'],xi['name'] ]) if 'prefix' in args else xi['name'] JOIN_SQL = "INNER JOIN :xi.name ON ".replace(':xi.name',xi_name) value = ".".join([xi['name'],args['join']]) f.append(value) ON_SQL = "" tmp = [] for term in f : ON_SQL = ":xi.name.:ofield = :xo.name.:ofield".replace(":xo.name",xo['name']) ON_SQL = ON_SQL.replace(":xi.name.:ofield",term).replace(":ofield",args['join']) tmp.append(ON_SQL) INNER_JOINS += [JOIN_SQL + " AND ".join(tmp)] return SQL + " ".join(INNER_JOINS) def get_final_sql(**args): xo = args['xo'] xi = args['xi'] join=args['join'] prefix = args['prefix'] if 'prefix' in args else '' fields = get_fields (xo=xo,xi=xi,join=join) k = len(fields) n = np.random.randint(2,k) #-- number of fields to select i = np.random.randint(0,k,size=n) fields = [name for name in fields if fields.index(name) in i] base_sql = generate_sql(xo=xo,xi=xi,prefix) SQL = """ SELECT AVERAGE(count),size,n as selected_features,k as total_features FROM( SELECT COUNT(*) as count,count(:join) as pop,sum(:n) as N,sum(:k) as k,:fields FROM (:sql) GROUP BY :fields ) order by 1 """.replace(":sql",base_sql) # sql = "SELECT :fields FROM :xo.name INNER JOIN :xi.name ON :xi.name.:xi.y = :xo.y " # fields = ",".join(get_fields(xo=xi,xi=xi,join=xi['y'])) # sql = sql.replace(":fields",fields).replace(":xo.name",xo['name']).replace(":xi.name",xi['name']) # sql = sql.replace(":xi.y",xi['y']).replace(":xo.y",xo['y']) # return sql
In [33]:
xo = {"name":"person","fields":['person_id','date_of_birth','race','value_as_number']} xi = [{"name":"measurement","fields":['person_id','value_as_number','value_source_value']}] #,{"name":"observation","fields":["person_id","value_as_string","observation_source_value"]}] # generate_sql(xo=xo,xi=xi,join="person_id",prefix='raw') fields = get_fields(xo=xo,xi=xi,join='person_id') ofields = list(fields) k = len(fields) n = np.random.randint(2,k) #-- number of fields to select i = np.random.randint(0,k,size=n) fields = [name for name in fields if fields.index(name) in i]
In [34]:
fields
Out[34]:
['person.race', 'person.value_as_number', 'measurement.value_source_value']
In [55]:
xo = {"name":"person","fields":['person_id','date_of_birth','race'],"y":"person_id"} xi = {"name":"measurements","fields":['person_id','value_as_number','value_source_value'],"y":"person_id"} generate_sql(xo=xo,xi=xi)
Out[55]:
'SELECT person_id,value_as_number,measurements.value_source_value,measurements.value_as_number,value_source_value FROM person INNER JOIN measurements ON measurements.person_id = person_id '
In [59]:
""" We are designing a process that will take two tables that will generate """ import itertools list(itertools.combinations(['a','b','c'],2))
Out[59]:
[('a', 'b'), ('a', 'c'), ('b', 'c')]
In [6]:
# # find every table with person id at the very least or a subset of fields # np.random.randint(0,4,size=4)
Out[6]:
array([1, 3, 0, 0])
In [90]:
list(set(['a','b']) & set(['a']))
Out[90]:
['a']
In [120]:
x_ = 1
In [10]:
x_ = pd.DataFrame({"group":[1,1,1,1,1], "size":[2,1,1,1,1]})
In [12]:
x_.groupby(['group']).mean()
Out[12]:
size | |
---|---|
group | |
1 | 1.2 |
In [ ]: