bug fix: matrix space restriction

2020-04-14 16:24:02 -05:00 · 2020-04-14 16:24:02 -05:00 · 8f390931f3
parent 1cf9c6e47a
commit 8f390931f3
3 changed files with 12 additions and 22 deletions
--- a/data/bridge.py
+++ b/data/bridge.py
@@ -173,7 +173,7 @@ class Binary :
            # N = 
            i = np.random.choice(col_count,size)
            values = values[-i]
-            col_count = N
+            col_count = size
            

       
@@ -209,7 +209,7 @@ class Binary :
            # N = 
            i = np.random.choice(col_count,size)
            values = values[-i]
-            col_count = N
+            col_count = size
        return values
 
    def _Export(self,df) :
@@ -271,7 +271,7 @@ if __name__ == '__main__' :
    """
    df = pd.read_csv('sample.csv')
    print ( pd.get_dummies(df.race))
-    print ( (Binary()).apply(df.race, 30))
+    print ( (Binary()).apply(df.race, 2))

    # has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys()
    # has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys()
--- a/data/maker/init.py
+++ b/data/maker/init.py
@@ -136,7 +136,7 @@ def train (**args) :
            # print (df[col].dtypes)
            # print (df[col].dropna/(axis=1).unique())
        # args['real']  = pd.get_dummies(df[col].dropna()).astype(np.float32).values
-        msize = args['matrix_size'] if 'matrix_size' in args else -1
+        msize = args['matrix_size'] if 'matrix_size' in args else 128
        args['real'] = (Binary()).apply(df[col],msize)

            
@@ -210,7 +210,7 @@ def generate(**args):
            
        # else:
        # values          = df[col].dropna().unique().tolist()
-        msize = args['matrix_size'] if 'matrix_size' in args else -1
+        msize = args['matrix_size'] if 'matrix_size' in args else 128
        values = bhandler.get_column_values(df[col])

        
--- a/pipeline.py
+++ b/pipeline.py
@@ -73,21 +73,7 @@ class Components :
 		# @TODO: we need to log something here about the parameters being passed
 		# pointer  = args['reader'] if 'reader' in args else lambda: Components.get(**args)
 		df = args['data']
-		
-		if 'slice' in args and 'max_rows' in args['slice']:
-			max_rows = args['slice']['max_rows']
-			if df.shape[0] > max_rows :
-				print (".. slicing ")
-				i = np.random.choice(df.shape[0],max_rows,replace=False)
-				df = df.iloc[i]
-		
-		
-			#
-			# Certain columns need to be removed too large of a matrix
-			#
-		# if df.shape[0] == 0 :
-		# 	print ("CAN NOT TRAIN EMPTY DATASET ")
-		# 	return 
+
 		#
 		# Now we can parse the arguments and submit the entire thing to training
 		#
@@ -102,8 +88,8 @@ class Components :
 		_args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
 		if 'batch_size' in args :
 			_args['batch_size'] = int(args['batch_size'])
-			
-		#
+		
+		_args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128		#
 		# We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel
 		#
 		if int(args['num_gpu']) > 1 :
@@ -157,6 +143,8 @@ class Components :
 		_args['num_gpu'] 	= 1
 		os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) 
 		_args['no_value']= args['no_value']
+		_args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128
+			
 		
 		# MAX_ROWS = args['max_rows']  	if 'max_rows' in args else 0
 		PART_SIZE = int(args['part_size']) if 'part_size' in args else 8
@@ -298,6 +286,8 @@ if __name__ == '__main__' :
 		args[key] = _config[key]
 	
 	args = dict(args,**SYS_ARGS)
+	if 'matrix_size' in args :
+		args['matrix_size'] = int(args['matrix_size'])
 	if 'batch_size' not in args :
 		args['batch_size']	= 2000 #if 'batch_size' not in args else int(args['batch_size'])
 	if 'dataset' not in args :