bug fix: finalize to remove duplicate keys

parent 3eb28dd798
commit 94798fd9a2

pipeline.py (121)
@@ -268,7 +268,48 @@ class Components :
         else:
             writer.write(_df[columns],table=args['from'])
 
+    # @staticmethod
+    def finalize(self,args):
+        """
+        This function performs post-processing operations on a synthetic table, i.e.:
+        - remove duplicate keys
+        - remove orphaned keys
+        """
+        reader = factory.instance(**args['store']['source'])
+        logger = factory.instance(**args['store']['logs'])
+        target = args['store']['target']['args']['dataset']
+        source = args['store']['source']['args']['dataset']
+        table = args['from']
+        schema = reader.meta(table=args['from'])
+        #
+        # keys :
+        unique_field = "_".join([args['from'],'id']) if 'unique_fields' not in args else args['unique_fields']
+        fields = [item.name if item.name != unique_field else "y."+item.name for item in schema]
+        SQL = [
+            "SELECT :fields FROM ",
+            "(SELECT ROW_NUMBER() OVER() AS row_number,* FROM :target.:table) x","INNER JOIN",
+            "(SELECT ROW_NUMBER() OVER() AS row_number, :unique_field FROM :source.:table) y",
+            "ON y.row_number = x.row_number"
+        ]
+        SQL = " ".join(SQL).replace(":fields",",".join(fields)).replace(":table",table).replace(":source",source).replace(":target",target)
+        SQL = SQL.replace(":unique_field",unique_field)
+        #
+        # Use a native job to get this done ...
+        #
+        client = bq.Client.from_service_account_json(args['store']['source']['args']["private_key"])
+        job = bq.QueryJobConfig()
+        job.destination = client.dataset(target).table(table)
+        job.use_query_cache = True
+        job.allow_large_results = True
+        # job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
+        job.write_disposition = "WRITE_TRUNCATE"
+        job.priority = 'BATCH'
+        r = client.query(SQL,location='US',job_config=job)
+        logger.write({"job":r.job_id,"action":"finalize", "args":{"sql":SQL,"source":".".join([source,table]),"destination":".".join([target,table])}})
+        #
+        # Keep a log of what just happened...
+        #
+        otable = ".".join([args['store']['source']['args']['dataset'],args['from']])
+        dtable = ".".join([args['store']['target']['args']['dataset'],args['from']])
     def generate(self,args):
         """
         This function will generate data and store it to a given,
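A minimal sketch of the query string that the template in finalize() assembles, shown here with made-up dataset, table, and column names (prod_io, synth_io, person, person_id, gender, birth_date are assumptions for illustration, not values from this commit):

    # Illustrative only: assumed schema and dataset names, mirroring the template in finalize().
    schema_fields = ['person_id', 'gender', 'birth_date']
    unique_field = 'person_id'
    fields = [name if name != unique_field else 'y.' + name for name in schema_fields]

    SQL = " ".join([
        "SELECT :fields FROM ",
        "(SELECT ROW_NUMBER() OVER() AS row_number,* FROM :target.:table) x", "INNER JOIN",
        "(SELECT ROW_NUMBER() OVER() AS row_number, :unique_field FROM :source.:table) y",
        "ON y.row_number = x.row_number"
    ])
    SQL = SQL.replace(":fields", ",".join(fields)).replace(":table", "person")
    SQL = SQL.replace(":source", "prod_io").replace(":target", "synth_io").replace(":unique_field", unique_field)
    print(SQL)
    # SELECT y.person_id,gender,birth_date FROM  (SELECT ROW_NUMBER() OVER() AS row_number,*
    # FROM synth_io.person) x INNER JOIN (SELECT ROW_NUMBER() OVER() AS row_number, person_id
    # FROM prod_io.person) y ON y.row_number = x.row_number

The ROW_NUMBER() join pairs each synthetic row with one source row, so the unique field in the result comes from the source (y) side while every other column comes from the synthetic (x) side; because the job's destination is the synthetic table itself with write_disposition WRITE_TRUNCATE, the deduplicated result overwrites that table in place.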
@@ -527,18 +568,7 @@ if __name__ == '__main__' :
     # @TODO:
     # Log what was initiated so we have context of this processing ...
     #
-    # if 'listen' not in SYS_ARGS :
-    #     if 'file' in args :
-    #         DATA = pd.read_csv(args['file']) ;
-    #         schema = []
-    #     else:
-    #         DATA = Components().get(args)
-    #         client = bq.Client.from_service_account_json(args["private_key"])
-    #         schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema
-
-    #     COLUMNS = DATA.columns
-    #     DATA = np.array_split(DATA,PART_SIZE)
-    #     args['schema'] = schema
     GPU_CHIPS = args['gpu'] if 'gpu' in args else None
     if GPU_CHIPS and type(GPU_CHIPS) != list :
         GPU_CHIPS = [int(_id.strip()) for _id in GPU_CHIPS.split(',')] if type(GPU_CHIPS) == str else [GPU_CHIPS]
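As a small worked example of the GPU_CHIPS normalization in the context lines above (the value "0, 1" is hypothetical, not taken from the commit):

    # Hypothetical CLI value: a comma-separated list of GPU ids.
    GPU_CHIPS = "0, 1"
    if GPU_CHIPS and type(GPU_CHIPS) != list :
        GPU_CHIPS = [int(_id.strip()) for _id in GPU_CHIPS.split(',')] if type(GPU_CHIPS) == str else [GPU_CHIPS]
    print(GPU_CHIPS)   # [0, 1]

A bare integer (e.g. 1) would instead be wrapped as [1].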
@@ -550,50 +580,6 @@ if __name__ == '__main__' :
     # Let us see if we have partitions given the log folder
 
     content = os.listdir( os.sep.join([args['logs'],'train',args['context']]))
 
-    # if ''.join(content).isnumeric() :
-    #     #
-    #     # we have partitions we are working with
-
-    #     jobs = []
-
-    #     # columns = DATA.columns.tolist()
-
-    #     # DATA = np.array_split(DATA,PART_SIZE)
-
-    #     for index in range(0,PART_SIZE) :
-    #         if 'focus' in args and int(args['focus']) != index :
-    #             #
-    #             # This handles failures/recoveries for whatever reason
-    #             # If we are only interested in generating data for a given partition
-    #             continue
-    #         # index = id.index(id)
-
-    #         args['partition'] = index
-    #         args['data'] = DATA[index]
-    #         if int(args['num_gpu']) > 1 :
-    #             args['gpu'] = index
-    #         else:
-    #             args['gpu']=0
-
-    #         make = lambda _args: (Components()).generate(_args)
-    #         job = Process(target=make,args=(args,))
-    #         job.name = 'generator # '+str(index)
-    #         job.start()
-    #         jobs.append(job)
-    #         # if len(jobs) == 1 :
-    #         #     job.join()
-
-    #     print (["Started ",len(jobs),"generators" if len(jobs)>1 else "generator" ])
-    #     while len(jobs)> 0 :
-    #         jobs = [job for job in jobs if job.is_alive()]
-    #         time.sleep(2)
-
-    #     # generator.generate(args)
-    # else:
-    #     generator.generate(args)
-    # Components.generate(args)
     if 'all-chips' in SYS_ARGS and GPU_CHIPS:
         index = 0
         jobs = []
@@ -625,7 +611,7 @@ if __name__ == '__main__' :
         shuffler = Components()
         shuffler.shuffle(args)
         pass
-    else:
+    elif 'train' in SYS_ARGS:
 
         # DATA = np.array_split(DATA,PART_SIZE)
         #
@@ -657,10 +643,25 @@ if __name__ == '__main__' :
     #
     # If we have any jobs we should wait till they finish
     #
-    while len(jobs)> 0 :
-        jobs = [job for job in jobs if job.is_alive()]
-        time.sleep(2)
+    DIRTY = 0
+    while len(jobs)> 0 :
+        DIRTY = 1
+        jobs = [job for job in jobs if job.is_alive()]
+        time.sleep(2)
+    if DIRTY:
+        print (["..:: jobs finished "])
+    #
+    # We need to harmonize the keys, if any; we do this for shuffle or generate operations
+    #
+    print (['finalize' in SYS_ARGS, ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) ])
+    if 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) :
+        #
+        # We should pull all the primary keys and regenerate them in order to ensure some form of consistency
+        #
+
+        (Components()).finalize(args)
+        # finalize(args)
+        pass
     # jobs = []
     # for index in range(0,PART_SIZE) :
     #     if 'focus' in args and int(args['focus']) != index :
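For clarity, a self-contained sketch of the control flow this last hunk introduces: drain any worker processes, then run finalize() whenever a finalize, generate, or shuffle run was requested. The function and parameter names below are illustrative; only the logic mirrors the diff.

    import time

    def wait_then_finalize(jobs, sys_args, args, components):
        # Wait for worker processes started earlier in the pipeline.
        dirty = False
        while len(jobs) > 0:
            dirty = True
            jobs = [job for job in jobs if job.is_alive()]
            time.sleep(2)
        if dirty:
            print("..:: jobs finished")
        # Harmonize keys: remove duplicate keys from the synthetic table.
        if 'finalize' in sys_args or 'generate' in sys_args or 'shuffle' in sys_args:
            components.finalize(args)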
setup.py (2)
@@ -5,7 +5,7 @@ import sys
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 args = {"name":"data-maker",
-        "version":"1.4.5",
+        "version":"1.4.6",
         "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
         "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
 args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']