7.3 KiB
7.3 KiB
None
<html lang="en">
<head>
</head>
</html>
In [66]:
import pandas as pd import numpy as np from google.cloud import bigquery as bq client = bq.Client.from_service_account_json('/home/steve/dev/google-cloud-sdk/accounts/vumc-test.json')
In [33]:
xo = ['person_id','date_of_birth','race'] xi = ['person_id','value_as_number','value_source_value']
In [53]:
def get_tables(client,did,fields=[]): """ getting table lists from google """ r = [] ref = client.dataset(id) tables = list(client.list_tables(ref)) for table in tables : ref = table.reference schema = client.get_table(ref).schema names = [f.field_name for f in schema] x = list(set(names) & set(fields)) if x : r.append({"name":table.table_id,"fields":names}) return r def get_fields(**args): """ This function will generate a random set of fields from two tables. Tables are structured as follows {name,fields:[],"y":}, with name table name (needed to generate sql query) fields list of field names, used in the projection y name of the field to be joined. @param xo candidate table in the join @param xi candidate table in the join @param join field by which the tables can be joined. """ # The set operation will remove redundancies in the field names (not sure it's a good idea) xo = args['xo']['fields'] xi = args['xi']['fields'] zi = args['xi']['name'] return list(set(xo) | set(['.'.join([args['xi']['name'],name]) for name in xi if name != args['join']]) ) def generate_sql(**args): """ This function will generate the SQL query for the resulting join """ xo = args['xo'] xi = args['xi'] sql = "SELECT :fields FROM :xo.name INNER JOIN :xi.name ON :xi.name.:xi.y = :xo.y " fields = ",".join(get_fields(xo=xi,xi=xi,join=xi['y'])) sql = sql.replace(":fields",fields).replace(":xo.name",xo['name']).replace(":xi.name",xi['name']) sql = sql.replace(":xi.y",xi['y']).replace(":xo.y",xo['y']) return sql
In [54]:
xo = {"name":"person","fields":['person_id','date_of_birth','race']} xi = {"name":"measurements","fields":['person_id','value_as_number','value_source_value']} get_fields(xo=xo,xi=xi,join="person_id")
Out[54]:
['person_id', 'measurements.value_as_number', 'date_of_birth', 'race', 'measurements.value_source_value']
In [55]:
xo = {"name":"person","fields":['person_id','date_of_birth','race'],"y":"person_id"} xi = {"name":"measurements","fields":['person_id','value_as_number','value_source_value'],"y":"person_id"} generate_sql(xo=xo,xi=xi)
Out[55]:
'SELECT person_id,value_as_number,measurements.value_source_value,measurements.value_as_number,value_source_value FROM person INNER JOIN measurements ON measurements.person_id = person_id '
In [59]:
""" We are designing a process that will take two tables that will generate """ import itertools list(itertools.combinations(['a','b','c'],2))
Out[59]:
[('a', 'b'), ('a', 'c'), ('b', 'c')]
In [87]:
ref = client.dataset('raw') tables = list(client.list_tables(ref)) names = [table.table_id for table in tables] (tables[0].reference)
Out[87]:
TableReference(DatasetReference(u'aou-res-deid-vumc-test', u'raw'), 'care_site')
In [85]:
# # find every table with person id at the very least or a subset of fields # def get_tables q = ['person_id'] pairs = list(itertools.combinations(names,len(names))) pairs[0]
Out[85]:
(u'care_site', u'concept', u'concept_ancestor', u'concept_class', u'concept_relationship', u'concept_synonym', u'condition_occurrence', u'criteria', u'death', u'device_exposure', u'domain', u'drug_exposure', u'drug_strength', u'location', u'measurement', u'note', u'observation', u'people_seed', u'person', u'procedure_occurrence', u'relationship', u'visit_occurrence', u'vocabulary')
In [90]:
list(set(['a','b']) & set(['a']))
Out[90]:
['a']
In [ ]: