gpu indexing
This commit is contained in:
parent
732ccb42e5
commit
5a16e325ac
|
@ -81,7 +81,6 @@ class ContinuousToDiscrete :
|
||||||
return values
|
return values
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def train (**_args):
|
def train (**_args):
|
||||||
"""
|
"""
|
||||||
:params sql
|
:params sql
|
||||||
|
@ -126,7 +125,7 @@ def train (**_args):
|
||||||
args['matrix_size'] = _matrix.shape[0]
|
args['matrix_size'] = _matrix.shape[0]
|
||||||
args['batch_size'] = 2000
|
args['batch_size'] = 2000
|
||||||
args['partition'] = 0 if 'partition' not in _args else _args['partition']
|
args['partition'] = 0 if 'partition' not in _args else _args['partition']
|
||||||
os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0'
|
# os.environ['CUDA_VISIBLE_DEVICES'] = str(args['gpu']) if 'gpu' in args else '0'
|
||||||
|
|
||||||
trainer = gan.Train(**args)
|
trainer = gan.Train(**args)
|
||||||
#
|
#
|
||||||
|
@ -215,8 +214,7 @@ def generate(**_args):
|
||||||
_inputhandler = prepare.Input(**_args)
|
_inputhandler = prepare.Input(**_args)
|
||||||
values,_matrix = _inputhandler.convert()
|
values,_matrix = _inputhandler.convert()
|
||||||
args['values'] = np.array(values)
|
args['values'] = np.array(values)
|
||||||
if 'gpu' in _args :
|
|
||||||
os.environ['CUDA_VISIBLE_DEVICES'] = str(_args['gpu'])
|
|
||||||
handler = gan.Predict (**args)
|
handler = gan.Predict (**args)
|
||||||
handler.load_meta(None)
|
handler.load_meta(None)
|
||||||
#
|
#
|
||||||
|
@ -226,76 +224,3 @@ def generate(**_args):
|
||||||
candidates = handler.apply(candidates=args['candidates'])
|
candidates = handler.apply(candidates=args['candidates'])
|
||||||
return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates]
|
return [_inputhandler.revert(matrix=_matrix) for _matrix in candidates]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _generate(**args):
    """
    Build a synthetic version of a dataset, one requested column at a time,
    using a model that was previously learnt for each of those columns.

    @return pandas.DataFrame    copy of the input whose requested columns have been replaced

    :data       data-frame to be synthesized, or a string that is handed to pandas.read_csv
    :column     column (or list of columns) that need to be synthesized (discrete)
    :continuous optional list of columns whose output is binned (defaults to [])
    :bin_size   optional number of bins used for continuous columns (defaults to 4)
    :matrix_size optional size forwarded to the Binary handler (defaults to -1)
    :logs       location on disk where the learnt knowledge of the dataset is
                (not read here; it is forwarded to gan.Predict with the other arguments)
    """
    source = args['data']
    df = pd.read_csv(source) if isinstance(source, str) else source

    CONTINUOUS = args.get('continuous', [])
    requested = args['column']
    columns = requested if isinstance(requested, list) else [requested]
    #
    # @TODO:
    #   If the identifier is not present, we should find a way to determine or make one
    #
    BIN_SIZE = int(args['bin_size']) if 'bin_size' in args else 4
    bhandler = Binary()
    _df = df.copy()
    row_count = df.shape[0]

    for name in columns:
        #
        # The predictor is configured through the very same keyword dictionary,
        # so the per-column settings are written back into it on every pass
        args['context'] = name
        args['column'] = name

        msize = args.get('matrix_size', -1)
        original = df[name]
        values = bhandler.get_column(original, msize)
        MISSING = bhandler.get_missing(original, msize)

        args['values'] = values
        args['row_count'] = row_count
        args['missing'] = MISSING
        #
        # we can determine the cardinalities here so we know what to allow or disallow
        handler = gan.Predict(**args)
        handler.load_meta(name)
        r = handler.apply()

        if name in CONTINUOUS:
            #
            # approximating based on arbitrary bins; the array is stored back into
            # the result first and then read from it, exactly as the column is consumed below
            r[name] = np.array(r[name])
            r[name] = ContinuousToDiscrete.continuous(r[name], BIN_SIZE)

        _df[name] = r[name]
        #
        # Let's cast the type to the original type (it makes the data more usable)
        #
        _df[name] = _df[name].astype(df[name].dtype)
        #
        # @TODO: log basic stats about the synthetic attribute
        #

    return _df
|
|
11
pipeline.py
11
pipeline.py
|
@ -81,7 +81,12 @@ class Components :
|
||||||
terms = _args['columns']
|
terms = _args['columns']
|
||||||
return [name for name in _df.columns if np.sum( [int(field in name )for field in terms ]) > 0 ]
|
return [name for name in _df.columns if np.sum( [int(field in name )for field in terms ]) > 0 ]
|
||||||
return []
|
return []
|
||||||
|
def set_gpu(self, **_args):
    """
    Select the GPU this process is allowed to see and return the normalized
    GPU specification.

    The first entry of the specification is written (as a string) to the
    CUDA_VISIBLE_DEVICES environment variable.

    :gpu    a single device index (int or str) or a sequence of device indexes

    @return the specification as a sequence (a scalar is wrapped in a list),
            or None when no gpu was provided
    """
    gpu = _args.get('gpu')
    if gpu is None:
        #
        # Nothing was requested: leave the environment untouched and make the
        # result explicit instead of relying on an unbound local variable
        return None
    #
    # A scalar (e.g. 0 or '0' read from a configuration) has no usable first
    # element, so it is wrapped; strings are wrapped as well because indexing
    # them would only yield their first character
    if isinstance(gpu, str) or not hasattr(gpu, '__len__'):
        gpu = [gpu]
    if len(gpu) > 0:
        #
        # Only the first device is exposed through the environment; an empty
        # specification leaves the current setting as it is
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu[0])
    return gpu
|
||||||
def train(self,**args):
|
def train(self,**args):
|
||||||
"""
|
"""
|
||||||
This function will perform training on the basis of a given pointer that reads data
|
This function will perform training on the basis of a given pointer that reads data
|
||||||
|
@ -137,7 +142,7 @@ class Components :
|
||||||
if x_cols :
|
if x_cols :
|
||||||
_args['data'] = df[list(set(df.columns) - set(x_cols))]
|
_args['data'] = df[list(set(df.columns) - set(x_cols))]
|
||||||
if 'gpu' in args :
|
if 'gpu' in args :
|
||||||
_args['gpu'] = args['gpu']
|
_args['gpu'] = self.set_gpu(gpu=args['gpu'])
|
||||||
data.maker.train(**_args)
|
data.maker.train(**_args)
|
||||||
|
|
||||||
if 'autopilot' in ( list(args.keys())) :
|
if 'autopilot' in ( list(args.keys())) :
|
||||||
|
@ -228,6 +233,8 @@ class Components :
|
||||||
args['data'] = df[list(set(df.columns) - set(x_cols))]
|
args['data'] = df[list(set(df.columns) - set(x_cols))]
|
||||||
|
|
||||||
args['candidates'] = 1 if 'candidates' not in args else int(args['candidates'])
|
args['candidates'] = 1 if 'candidates' not in args else int(args['candidates'])
|
||||||
|
if 'gpu' in args :
|
||||||
|
args['gpu'] = self.set_gpu(gpu=args['gpu'])
|
||||||
|
|
||||||
candidates = (data.maker.generate(**args))
|
candidates = (data.maker.generate(**args))
|
||||||
if 'sql.BQWriter' in ostore['type'] :
|
if 'sql.BQWriter' in ostore['type'] :
|
||||||
|
|
Loading…
Reference in New Issue