bug fix: finalize to remove duplicate keys

parent 3eb28dd798
commit 94798fd9a2

pipeline.py (121)
@@ -268,7 +268,48 @@ class Components :
         else:
             writer.write(_df[columns],table=args['from'])
 
+    # @staticmethod
+    def finalize(self,args):
+        """
+        This function performs post-processing operations on a synthetic table, i.e.:
+        - remove duplicate keys
+        - remove orphaned keys
+        """
+        reader = factory.instance(**args['store']['source'])
+        logger = factory.instance(**args['store']['logs'])
+        target = args['store']['target']['args']['dataset']
+        source = args['store']['source']['args']['dataset']
+        table = args['from']
+        schema = reader.meta(table=args['from'])
+        #
+        # keys :
+        unique_field = "_".join([args['from'],'id']) if 'unique_fields' not in args else args['unique_fields']
+        fields = [item.name if item.name != unique_field else "y."+item.name for item in schema]
+        SQL = [
+            "SELECT :fields FROM ",
+            "(SELECT ROW_NUMBER() OVER() AS row_number,* FROM :target.:table) x","INNER JOIN",
+            "(SELECT ROW_NUMBER() OVER() AS row_number, :unique_field FROM :source.:table) y",
+            "ON y.row_number = x.row_number"
+        ]
+        SQL = " ".join(SQL).replace(":fields",",".join(fields)).replace(":table",table).replace(":source",source).replace(":target",target)
+        SQL = SQL.replace(":unique_field",unique_field)
+        #
+        # Use a native job to get this done ...
+        #
+        client = bq.Client.from_service_account_json(args['store']['source']['args']["private_key"])
+        job = bq.QueryJobConfig()
+        job.destination = client.dataset(target).table(table)
+        job.use_query_cache = True
+        job.allow_large_results = True
+        # job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
+        job.write_disposition = "WRITE_TRUNCATE"
+        job.priority = 'BATCH'
+        r = client.query(SQL,location='US',job_config=job)
+        logger.write({"job":r.job_id,"action":"finalize", "args":{"sql":SQL,"source":".".join([source,table]),"destination":".".join([target,table])}})
+        #
+        # Keep a log of what just happened...
+        #
+        otable = ".".join([args['store']['source']['args']['dataset'],args['from']])
+        dtable = ".".join([args['store']['target']['args']['dataset'],args['from']])
     def generate(self,args):
         """
         This function will generate data and store it to a given,
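A minimal sketch of the query string that the template in finalize() assembles, shown here with made-up dataset, table, and column names (prod_io, synth_io, person, person_id, gender, birth_date are assumptions for illustration, not values from this commit):

    # Illustrative only: assumed schema and dataset names, mirroring the template in finalize().
    schema_fields = ['person_id', 'gender', 'birth_date']
    unique_field = 'person_id'
    fields = [name if name != unique_field else 'y.' + name for name in schema_fields]

    SQL = " ".join([
        "SELECT :fields FROM ",
        "(SELECT ROW_NUMBER() OVER() AS row_number,* FROM :target.:table) x", "INNER JOIN",
        "(SELECT ROW_NUMBER() OVER() AS row_number, :unique_field FROM :source.:table) y",
        "ON y.row_number = x.row_number"
    ])
    SQL = SQL.replace(":fields", ",".join(fields)).replace(":table", "person")
    SQL = SQL.replace(":source", "prod_io").replace(":target", "synth_io").replace(":unique_field", unique_field)
    print(SQL)
    # SELECT y.person_id,gender,birth_date FROM  (SELECT ROW_NUMBER() OVER() AS row_number,*
    # FROM synth_io.person) x INNER JOIN (SELECT ROW_NUMBER() OVER() AS row_number, person_id
    # FROM prod_io.person) y ON y.row_number = x.row_number

The ROW_NUMBER() join pairs each synthetic row with one source row, so the unique field in the result comes from the source (y) side while every other column comes from the synthetic (x) side; because the job's destination is the synthetic table itself with write_disposition WRITE_TRUNCATE, the deduplicated result overwrites that table in place.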
@@ -527,18 +568,7 @@ if __name__ == '__main__' :
     # @TODO:
     # Log what was initiated so we have context of this processing ...
     #
-    # if 'listen' not in SYS_ARGS :
-    #     if 'file' in args :
-    #         DATA = pd.read_csv(args['file']) ;
-    #         schema = []
-    #     else:
-    #         DATA = Components().get(args)
-    #         client = bq.Client.from_service_account_json(args["private_key"])
-    #         schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema
-
-    #     COLUMNS = DATA.columns
-    #     DATA = np.array_split(DATA,PART_SIZE)
-    #     args['schema'] = schema
     GPU_CHIPS = args['gpu'] if 'gpu' in args else None
     if GPU_CHIPS and type(GPU_CHIPS) != list :
         GPU_CHIPS = [int(_id.strip()) for _id in GPU_CHIPS.split(',')] if type(GPU_CHIPS) == str else [GPU_CHIPS]
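As a small worked example of the GPU_CHIPS normalization in the context lines above (the value "0, 1" is hypothetical, not taken from the commit):

    # Hypothetical CLI value: a comma-separated list of GPU ids.
    GPU_CHIPS = "0, 1"
    if GPU_CHIPS and type(GPU_CHIPS) != list :
        GPU_CHIPS = [int(_id.strip()) for _id in GPU_CHIPS.split(',')] if type(GPU_CHIPS) == str else [GPU_CHIPS]
    print(GPU_CHIPS)   # [0, 1]

A bare integer (e.g. 1) would instead be wrapped as [1].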
@@ -550,50 +580,6 @@ if __name__ == '__main__' :
     # Let us see if we have partitions given the log folder
 
     content = os.listdir( os.sep.join([args['logs'],'train',args['context']]))
 
-    # if ''.join(content).isnumeric() :
-    #     #
-    #     # we have partitions we are working with
-
-    #     jobs = []
-
-    #     # columns = DATA.columns.tolist()
-
-    #     # DATA = np.array_split(DATA,PART_SIZE)
-
-    #     for index in range(0,PART_SIZE) :
-    #         if 'focus' in args and int(args['focus']) != index :
-    #             #
-    #             # This handles failures/recoveries for whatever reason
-    #             # If we are only interested in generating data for a given partition
-    #             continue
-    #         # index = id.index(id)
-
-    #         args['partition'] = index
-    #         args['data'] = DATA[index]
-    #         if int(args['num_gpu']) > 1 :
-    #             args['gpu'] = index
-    #         else:
-    #             args['gpu']=0
-
-    #         make = lambda _args: (Components()).generate(_args)
-    #         job = Process(target=make,args=(args,))
-    #         job.name = 'generator # '+str(index)
-    #         job.start()
-    #         jobs.append(job)
-    #         # if len(jobs) == 1 :
-    #         #     job.join()
-
-    #     print (["Started ",len(jobs),"generators" if len(jobs)>1 else "generator" ])
-    #     while len(jobs)> 0 :
-    #         jobs = [job for job in jobs if job.is_alive()]
-    #         time.sleep(2)
-
-    #     # generator.generate(args)
-    # else:
-    #     generator.generate(args)
-    # Components.generate(args)
     if 'all-chips' in SYS_ARGS and GPU_CHIPS:
         index = 0
         jobs = []
@@ -625,7 +611,7 @@ if __name__ == '__main__' :
         shuffler = Components()
         shuffler.shuffle(args)
         pass
-    else:
+    elif 'train' in SYS_ARGS:
 
         # DATA = np.array_split(DATA,PART_SIZE)
         #
@@ -657,10 +643,25 @@ if __name__ == '__main__' :
     #
     # If we have any jobs we should wait till they finish
     #
-    while len(jobs)> 0 :
-        jobs = [job for job in jobs if job.is_alive()]
-        time.sleep(2)
+    DIRTY = 0
+    while len(jobs)> 0 :
+        DIRTY = 1
+        jobs = [job for job in jobs if job.is_alive()]
+        time.sleep(2)
+    if DIRTY:
+        print (["..:: jobs finished "])
+    #
+    # We need to harmonize the keys, if any; we do this for shuffle or generate operations
+    #
+    print (['finalize' in SYS_ARGS, ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) ])
+    if 'finalize' in SYS_ARGS or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) :
+        #
+        # We should pull all the primary keys and regenerate them in order to ensure some form of consistency
+        #
+
+        (Components()).finalize(args)
+        # finalize(args)
+        pass
     # jobs = []
     # for index in range(0,PART_SIZE) :
     #     if 'focus' in args and int(args['focus']) != index :
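For clarity, a self-contained sketch of the control flow this last hunk introduces: drain any worker processes, then run finalize() whenever a finalize, generate, or shuffle run was requested. The function and parameter names below are illustrative; only the logic mirrors the diff.

    import time

    def wait_then_finalize(jobs, sys_args, args, components):
        # Wait for worker processes started earlier in the pipeline.
        dirty = False
        while len(jobs) > 0:
            dirty = True
            jobs = [job for job in jobs if job.is_alive()]
            time.sleep(2)
        if dirty:
            print("..:: jobs finished")
        # Harmonize keys: remove duplicate keys from the synthetic table.
        if 'finalize' in sys_args or 'generate' in sys_args or 'shuffle' in sys_args:
            components.finalize(args)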
setup.py (2)
@@ -5,7 +5,7 @@ import sys
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 args = {"name":"data-maker",
-        "version":"1.4.5",
+        "version":"1.4.6",
         "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
         "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
 args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']