bug fix: zeros matrix and continuous variables

2021-04-01 12:14:51 -05:00 · 2021-04-01 12:14:51 -05:00 · e0601edea5
parent 20ee62178a
commit e0601edea5
3 changed files with 71 additions and 11 deletions
--- a/data/maker/prepare/init.py
+++ b/data/maker/prepare/init.py
@ -16,6 +16,9 @@ import numpy as np
 # import cupy as cp
 import sys
 import os
+#
+# The following is to address the issue with creating a large matrix ...
+# 
 # from multiprocessing import Process, Queue

 # if 'GPU' in os.environ :
@ -230,8 +233,11 @@ class Input :
        cols = np.array(cols)
        row_count = len(rows)
        # if 'GPU' not in os.environ :
-        _matrix = np.zeros([row_count,cols.size])
-        
+        # _matrix = np.zeros([row_count,cols.size],dtype=int)
+        #
+        # @NOTE: Allocating the whole matrix with a single np.zeros call raised an out-of-memory error here; building it row by row seems to avoid it (cause not yet understood)
+        #
+        _matrix = np.array([np.zeros(cols.size) for i in np.arange(row_count)])
        [np.put(_matrix[i], np.where(cols ==  rows[i])  ,1)for i in np.arange(row_count) if np.where(cols == rows[i])[0].size > 0]
        # else:
        #     _matrix = cp.zeros([row_count,cols.size])
--- a/pipeline.py
+++ b/pipeline.py
@ -122,10 +122,20 @@ class Components :
 		_args = copy.deepcopy(args)
 		# _args['store']  = args['store']['source']
 		_args['data'] = df
+		#
+		# The continuous columns should also be skipped here because they do not need to be synthesized like the other columns
+		if 'continuous' in args :
+			x_cols = args['continuous']
+		else:
+			x_cols = []
+
 		if 'ignore' in args and 'columns' in args['ignore'] :
 			_cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
 			_args['data'] = df[ list(set(df.columns)- set(_cols))]
-		
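+		# We need to make sure that the continuous columns are removed.
+		# NOTE(review): this selects from df rather than _args['data'], so the 'ignore' filter applied just above appears to be overwritten -- confirm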
+		if x_cols :
+			_args['data'] = df[list(set(df.columns) - set(x_cols))]
 		data.maker.train(**_args)
 		
 		if 'autopilot' in ( list(args.keys())) :
@ -136,7 +146,26 @@ class Components :

 		pass

-	def post(self,args):
+	def approximate(self,values):
+		"""
+		:param values	array of values to be approximated
+		"""
+		if values.dtype in [int,float] :
+			r = np.random.dirichlet(values)
+			x = []
+			_type = values.dtype
+			for index in np.arange(values.size) :
+				
+				if np.random.choice([0,1],1)[0] :
+					value = values[index] + (values[index] * r[index])
+				else :
+					value = values[index] - (values[index] * r[index])
+				value = int(value) if _type == int else np.round(value,2)
+				x.append( value)
+			np.random.shuffle(x)
+			return np.array(x)
+		else:
+			return values
 		pass
 			

@ -179,10 +208,23 @@ class Components :
 		_dc = pd.DataFrame()
 		# for mdf in df :
 		args['data'] = df
+		#
+		# The continuous columns should also be skipped here because they do not need to be synthesized like the other columns
+		if 'continuous' in args :
+			x_cols = args['continuous']
+		else:
+			x_cols = []
+
 		if 'ignore' in args and 'columns' in args['ignore'] :
 			_cols = self.get_ignore(data=df,columns=args['ignore']['columns'])
 			args['data'] = df[ list(set(df.columns)- set(_cols))]
-
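+		#
+		# We need to remove the continuous columns from the data-frame
+		# @TODO: Abstract this (the same filtering is duplicated in the training step)
+		# NOTE(review): this selects from df rather than args['data'], so the 'ignore' filter applied just above appears to be overwritten -- confirm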
+		if x_cols :
+			args['data'] = df[list(set(df.columns) - set(x_cols))]
+		
 		args['candidates']	= 1 if 'candidates' not in args else int(args['candidates'])
 		
 		candidates = (data.maker.generate(**args))
@ -192,7 +234,10 @@ class Components :
 			_columns = None
 			skip_columns = []
 			_schema = schema
-			cols = [_item['name'] for _item in _schema]
+			if schema :
+				cols = [_item['name'] for _item in _schema]
+			else:
+				cols = df.columns
 			for _df in candidates :
 				#
 				# we need to format the fields here to make sure we have something cohesive
@ -206,6 +251,9 @@ class Components :
 							# 	for _name in _df.columns:
 							# 		if _name in name:
 							# 			skip_columns.append(_name)
+				if x_cols :
+					for _col in x_cols :
+						_df[_col] = self.approximate(df[_col])
 				#
 				# We perform a series of set operations to ensure that the following conditions are met:
 				#	- the synthetic dataset only has fields that need to be synthesized
@ -222,10 +270,16 @@ class Components :
 				# Let us merge the datasets here so that we have a comprehensive dataset

 				_df = pd.DataFrame.join(df,_df)
-				for _item in _schema :
-					if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
-						_df[_item['name']] = _df[_item['name']].astype(str)
-				writer.write(_df[cols],schema=_schema,table=args['from'])
+				if _schema :
+					for _item in _schema :
+						if _item['type'] in ['DATE','TIMESTAMP','DATETIME'] :
+							_df[_item['name']] = _df[_item['name']].astype(str)
+				
+						pass
+				if _schema :
+					writer.write(_df[cols],schema=_schema,table=args['from'])
+				else:
+					writer.write(_df[cols],table=args['from'])
 			# 	writer.write(df,table=table)
 			pass
 		else:
--- a/setup.py
+++ b/setup.py
@ -5,7 +5,7 @@ import sys
 def read(fname):
    return open(os.path.join(os.path.dirname(__file__), fname)).read() 
 args = {"name":"data-maker",
-        "version":"1.4.3",
+        "version":"1.4.4",
        "author":"Vanderbilt University Medical Center","author_email":"steve.l.nyemba@vanderbilt.edu","license":"MIT",
        "packages":find_packages(),"keywords":["healthcare","data","transport","protocol"]}
 args["install_requires"] = ['data-transport@git+https://dev.the-phi.com/git/steve/data-transport.git','tensorflow==1.15','pandas','pandas-gbq','pymongo']