bug fix: matrix space restriction

This commit is contained in:
Steve Nyemba 2020-04-14 16:24:02 -05:00
parent 1cf9c6e47a
commit 8f390931f3
3 changed files with 12 additions and 22 deletions

View File

@ -173,7 +173,7 @@ class Binary :
# N = # N =
i = np.random.choice(col_count,size) i = np.random.choice(col_count,size)
values = values[-i] values = values[-i]
col_count = N col_count = size
@ -209,7 +209,7 @@ class Binary :
# N = # N =
i = np.random.choice(col_count,size) i = np.random.choice(col_count,size)
values = values[-i] values = values[-i]
col_count = N col_count = size
return values return values
def _Export(self,df) : def _Export(self,df) :
@ -271,7 +271,7 @@ if __name__ == '__main__' :
""" """
df = pd.read_csv('sample.csv') df = pd.read_csv('sample.csv')
print ( pd.get_dummies(df.race)) print ( pd.get_dummies(df.race))
print ( (Binary()).apply(df.race, 30)) print ( (Binary()).apply(df.race, 2))
# has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys() # has_basic = 'dataset' in SYS_ARGS.keys() and 'table' in SYS_ARGS.keys() and 'key' in SYS_ARGS.keys()
# has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys() # has_action= 'export' in SYS_ARGS.keys() or 'pseudo' in SYS_ARGS.keys()

View File

@ -136,7 +136,7 @@ def train (**args) :
# print (df[col].dtypes) # print (df[col].dtypes)
# print (df[col].dropna/(axis=1).unique()) # print (df[col].dropna/(axis=1).unique())
# args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values # args['real'] = pd.get_dummies(df[col].dropna()).astype(np.float32).values
msize = args['matrix_size'] if 'matrix_size' in args else -1 msize = args['matrix_size'] if 'matrix_size' in args else 128
args['real'] = (Binary()).apply(df[col],msize) args['real'] = (Binary()).apply(df[col],msize)
@ -210,7 +210,7 @@ def generate(**args):
# else: # else:
# values = df[col].dropna().unique().tolist() # values = df[col].dropna().unique().tolist()
msize = args['matrix_size'] if 'matrix_size' in args else -1 msize = args['matrix_size'] if 'matrix_size' in args else 128
values = bhandler.get_column_values(df[col]) values = bhandler.get_column_values(df[col])

View File

@ -73,21 +73,7 @@ class Components :
# @TODO: we need to log something here about the parameters being passed # @TODO: we need to log something here about the parameters being passed
# pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args) # pointer = args['reader'] if 'reader' in args else lambda: Components.get(**args)
df = args['data'] df = args['data']
if 'slice' in args and 'max_rows' in args['slice']:
max_rows = args['slice']['max_rows']
if df.shape[0] > max_rows :
print (".. slicing ")
i = np.random.choice(df.shape[0],max_rows,replace=False)
df = df.iloc[i]
#
# Certain columns need to be removed too large of a matrix
#
# if df.shape[0] == 0 :
# print ("CAN NOT TRAIN EMPTY DATASET ")
# return
# #
# Now we can parse the arguments and submit the entire thing to training # Now we can parse the arguments and submit the entire thing to training
# #
@ -102,8 +88,8 @@ class Components :
_args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs']) _args['max_epochs'] = 150 if 'max_epochs' not in args else int(args['max_epochs'])
if 'batch_size' in args : if 'batch_size' in args :
_args['batch_size'] = int(args['batch_size']) _args['batch_size'] = int(args['batch_size'])
# _args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128 #
# We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel # We ask the process to assume 1 gpu given the system number of GPU and that these tasks can run in parallel
# #
if int(args['num_gpu']) > 1 : if int(args['num_gpu']) > 1 :
@ -157,6 +143,8 @@ class Components :
_args['num_gpu'] = 1 _args['num_gpu'] = 1
os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu'])
_args['no_value']= args['no_value'] _args['no_value']= args['no_value']
_args['matrix_size'] = args['matrix_size'] if 'matrix_size' in args else 128
# MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0 # MAX_ROWS = args['max_rows'] if 'max_rows' in args else 0
PART_SIZE = int(args['part_size']) if 'part_size' in args else 8 PART_SIZE = int(args['part_size']) if 'part_size' in args else 8
@ -298,6 +286,8 @@ if __name__ == '__main__' :
args[key] = _config[key] args[key] = _config[key]
args = dict(args,**SYS_ARGS) args = dict(args,**SYS_ARGS)
if 'matrix_size' in args :
args['matrix_size'] = int(args['matrix_size'])
if 'batch_size' not in args : if 'batch_size' not in args :
args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size']) args['batch_size'] = 2000 #if 'batch_size' not in args else int(args['batch_size'])
if 'dataset' not in args : if 'dataset' not in args :