bug fix: finalize to remove duplicate keys

2021-04-28 16:47:38 -05:00 · 2021-04-28 16:47:38 -05:00 · 94798fd9a2
parent 3eb28dd798
commit 94798fd9a2
2 changed files with 63 additions and 62 deletions
--- a/pipeline.py
+++ b/pipeline.py
@ -268,7 +268,48 @@ class Components :
 		else:
 			writer.write(_df[columns],table=args['from'])

-	# @staticmethod
+	def finalize(self,args):
+		"""
+		This function performs post-processing operations on a synthetic table i.e :
+			- the key column(s) of the target (synthetic) table are replaced with the key column(s) of the source table
+			- rows of both tables are paired by position (ROW_NUMBER) with an INNER JOIN, rows without a counterpart are dropped
+		The result overwrites the target table (WRITE_TRUNCATE) through a BigQuery batch job.
+		The job is submitted and logged, this function does not wait for it to complete.
+
+		:args	dictionary with the following entries :
+			store.source	arguments of factory.instance (reader), args.dataset & args.private_key are read here
+			store.target	args.dataset is the dataset that holds the synthetic table
+			store.logs	arguments of factory.instance (logger)
+			from		name of the table (same name in the source and target datasets)
+			unique_fields	(optional) name of the key column or list of names, defaults to <from>_id
+		"""
+		reader = factory.instance(**args['store']['source'])
+		logger = factory.instance(**args['store']['logs'])
+		target = args['store']['target']['args']['dataset']
+		source = args['store']['source']['args']['dataset']
+		table = args['from']
+		schema = reader.meta(table=table)
+		#
+		# Fully qualified table names, used to keep a log of what happened
+		#
+		otable = ".".join([source,table])
+		dtable = ".".join([target,table])
+		#
+		# keys :
+		# A single column name is accepted as well as a list of column names, a list used to fail on str.replace
+		#
+		unique_fields = "_".join([table,'id']) if 'unique_fields' not in args else args['unique_fields']
+		if isinstance(unique_fields,str) :
+			unique_fields = [unique_fields]
+		#
+		# Key columns are read from the source (alias y), every other column is read from the target (alias x)
+		fields = [ item.name if item.name not in unique_fields else "y."+item.name for item in schema]
+		#
+		# NOTE(review): ROW_NUMBER() OVER() has no ORDER BY, the pairing of rows is therefore arbitrary -- confirm this is acceptable
+		SQL = [
+			"SELECT :fields FROM ",
+			"(SELECT ROW_NUMBER() OVER() AS row_number,* FROM :target.:table) x","INNER JOIN",
+			"(SELECT ROW_NUMBER() OVER() AS row_number, :unique_field FROM :source.:table) y",
+			"ON y.row_number = x.row_number"
+		]
+		SQL = " ".join(SQL).replace(":fields",",".join(fields)).replace(":table",table).replace(":source",source).replace(":target",target)
+		SQL = SQL.replace(":unique_field",",".join(unique_fields))
+		#
+		# Use a native job to get this done ...
+		#
+		client      = bq.Client.from_service_account_json(args['store']['source']['args']["private_key"])
+		job = bq.QueryJobConfig()
+		job.destination = client.dataset(target).table(table)
+		job.use_query_cache = True
+		job.allow_large_results = True
+		# job.time_partitioning = bq.table.TimePartitioning(type_=bq.table.TimePartitioningType.DAY)
+		job.write_disposition = "WRITE_TRUNCATE"
+		job.priority = 'BATCH'
+		r = client.query(SQL,location='US',job_config=job)
+		#
+		# Keep a log of what just happened...
+		# NOTE: the key "destimation" (sic) is left untouched because consumers of the logs may rely on it
+		#
+		logger.write({"job":r.job_id,"action":"finalize", "args":{"sql":SQL,"source":otable,"destimation":dtable}})
 	def generate(self,args):
 		"""
 		This function will generate data and store it to a given,
@ -527,18 +568,7 @@ if __name__ == '__main__' :
 	# @TODO:
 	#	Log what was initiated so we have context of this processing ...
 	#
-	# if 'listen' not in SYS_ARGS :
-	# if 'file' in args :
-	# 	DATA = pd.read_csv(args['file']) ;
-	# 	schema = []
-	# else:
-	# 	DATA = Components().get(args)
-	# 	client      = bq.Client.from_service_account_json(args["private_key"])
-	# 	schema = client.get_table(client.dataset(args['dataset']).table(args['from'])).schema

-	# COLUMNS = DATA.columns
-	# DATA = np.array_split(DATA,PART_SIZE)
-	# args['schema'] = schema
 	GPU_CHIPS = args['gpu'] if 'gpu' in args else None
 	if GPU_CHIPS and type(GPU_CHIPS) != list :				
 		GPU_CHIPS = [int(_id.strip()) for _id in GPU_CHIPS.split(',')] if type(GPU_CHIPS) == str else [GPU_CHIPS]
@ -550,50 +580,6 @@ if __name__ == '__main__' :
 		# Let us see if we have partitions given the log folder
 		
 		content = os.listdir( os.sep.join([args['logs'],'train',args['context']]))
-		
-		
-		# if ''.join(content).isnumeric() :
-		# 	#
-		# 	# we have partitions we are working with
-			
-		# 	jobs = []
-			
-		# 	# columns = DATA.columns.tolist()
-			
-		# 	# DATA  = np.array_split(DATA,PART_SIZE)
-
-		# 	for index in range(0,PART_SIZE) :
-		# 		if 'focus' in args and int(args['focus']) != index :
-		# 			#
-		# 			# This handles failures/recoveries for whatever reason
-		# 			# If we are only interested in generating data for a given partition 
-		# 			continue
-		# 		# index = id.index(id)
-				
-		# 		args['partition'] = index
-		# 		args['data'] = DATA[index]
-		# 		if int(args['num_gpu']) > 1 :
-		# 			args['gpu'] = index
-		# 		else:
-		# 			args['gpu']=0
-
-		# 		make = lambda _args: (Components()).generate(_args)
-		# 		job = Process(target=make,args=(args,))
-		# 		job.name = 'generator # '+str(index)
-		# 		job.start()
-		# 		jobs.append(job)
-		# 		# if len(jobs) == 1 :
-		# 		# 	job.join()
-
-		# 	print (["Started ",len(jobs),"generators" if len(jobs)>1 else "generator" ])
-		# 	while len(jobs)> 0 :
-		# 		jobs = [job for job in jobs if job.is_alive()]
-		# 		time.sleep(2)
-				
-		# 		# generator.generate(args)
-		# else:
-		# 	generator.generate(args)
-		# Components.generate(args)
 		if 'all-chips' in SYS_ARGS and GPU_CHIPS:
 			index = 0
 			jobs = []
@ -625,7 +611,7 @@ if __name__ == '__main__' :
 			shuffler = Components()
 			shuffler.shuffle(args)
 		pass
-	else:
+	elif 'train' in SYS_ARGS:
 		
 		# DATA  = np.array_split(DATA,PART_SIZE)
 		#
@ -657,10 +643,25 @@ if __name__ == '__main__' :
 		#
 		# If we have any obs we should wait till they finish
 		#
-		while len(jobs)> 0 :
-			jobs = [job for job in jobs if job.is_alive()]
-			time.sleep(2)
-
+	#
+	# Wait for the jobs that were started (if any) to terminate, the list is polled every 2 seconds
+	# DIRTY records whether there was anything to wait for, so the message below is only printed in that case
+	# NOTE(review): jobs must be bound on every path that reaches this point (e.g. shuffle or finalize alone), otherwise this raises a NameError -- confirm against the code above
+	#
+	DIRTY = 0
+	while len(jobs)> 0 :
+		DIRTY =1
+		jobs = [job for job in jobs if job.is_alive()]
+		time.sleep(2)
+	if DIRTY:
+		print (["..:: jobs finished "])
+	#
+	# We need to harmonize the keys, if any at all; this is done for finalize, generate or shuffle operations
+	#
+	# NOTE(review): the print below looks like leftover debugging output (it shows which of the flags were found) -- confirm it should remain
+	print (['finalize' in SYS_ARGS, ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) ])
+	if 'finalize' in SYS_ARGS  or ('generate' in SYS_ARGS or 'shuffle' in SYS_ARGS) :
+		#
+		# We should pull all the primary keys and regenerate them in order to ensure some form of consistency
+		#
+		
+		(Components()).finalize(args)
+		# finalize(args)
+		pass
 		# jobs = []
 		# for index in range(0,PART_SIZE) :
 		# 	if 'focus' in args and int(args['focus']) != index :
--- a/setup.py
+++ b/setup.py
@ -5,7 +5,7 @@ import sys
 def read(fname):
    return open(os.path.join(os.path.dirname(__file__), fname)).read() 
 args = {"name":"data-maker",
-        "version":"1.4.5",
+        "version":"1.4.6",
        "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
        "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
 args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']